1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X86
3; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X64
4
5; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx2-builtins.c
6
7define <4 x i64> @test_mm256_abs_epi8(<4 x i64> %a0) {
8; CHECK-LABEL: test_mm256_abs_epi8:
9; CHECK:       # %bb.0:
10; CHECK-NEXT:    vpabsb %ymm0, %ymm0
11; CHECK-NEXT:    ret{{[l|q]}}
12  %arg = bitcast <4 x i64> %a0 to <32 x i8>
13  %abs = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %arg, i1 false)
14  %res = bitcast <32 x i8> %abs to <4 x i64>
15  ret <4 x i64> %res
16}
17declare <32 x i8> @llvm.abs.v32i8(<32 x i8>, i1) nounwind readnone
18
19define <4 x i64> @test_mm256_abs_epi16(<4 x i64> %a0) {
20; CHECK-LABEL: test_mm256_abs_epi16:
21; CHECK:       # %bb.0:
22; CHECK-NEXT:    vpabsw %ymm0, %ymm0
23; CHECK-NEXT:    ret{{[l|q]}}
24  %arg = bitcast <4 x i64> %a0 to <16 x i16>
25  %abs = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %arg, i1 false)
26  %res = bitcast <16 x i16> %abs to <4 x i64>
27  ret <4 x i64> %res
28}
29declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1) nounwind readnone
30
31define <4 x i64> @test_mm256_abs_epi32(<4 x i64> %a0) {
32; CHECK-LABEL: test_mm256_abs_epi32:
33; CHECK:       # %bb.0:
34; CHECK-NEXT:    vpabsd %ymm0, %ymm0
35; CHECK-NEXT:    ret{{[l|q]}}
36  %arg = bitcast <4 x i64> %a0 to <8 x i32>
37  %abs = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %arg, i1 false)
38  %res = bitcast <8 x i32> %abs to <4 x i64>
39  ret <4 x i64> %res
40}
41declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1) nounwind readnone
42
43define <4 x i64> @test_mm256_add_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
44; CHECK-LABEL: test_mm256_add_epi8:
45; CHECK:       # %bb.0:
46; CHECK-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
47; CHECK-NEXT:    ret{{[l|q]}}
48  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
49  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
50  %res = add <32 x i8> %arg0, %arg1
51  %bc = bitcast <32 x i8> %res to <4 x i64>
52  ret <4 x i64> %bc
53}
54
55define <4 x i64> @test_mm256_add_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
56; CHECK-LABEL: test_mm256_add_epi16:
57; CHECK:       # %bb.0:
58; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
59; CHECK-NEXT:    ret{{[l|q]}}
60  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
61  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
62  %res = add <16 x i16> %arg0, %arg1
63  %bc = bitcast <16 x i16> %res to <4 x i64>
64  ret <4 x i64> %bc
65}
66
67define <4 x i64> @test_mm256_add_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
68; CHECK-LABEL: test_mm256_add_epi32:
69; CHECK:       # %bb.0:
70; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
71; CHECK-NEXT:    ret{{[l|q]}}
72  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
73  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
74  %res = add <8 x i32> %arg0, %arg1
75  %bc = bitcast <8 x i32> %res to <4 x i64>
76  ret <4 x i64> %bc
77}
78
79define <4 x i64> @test_mm256_add_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
80; CHECK-LABEL: test_mm256_add_epi64:
81; CHECK:       # %bb.0:
82; CHECK-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
83; CHECK-NEXT:    ret{{[l|q]}}
84  %res = add <4 x i64> %a0, %a1
85  ret <4 x i64> %res
86}
87
88define <4 x i64> @test_mm256_adds_epi8(<4 x i64> %a0, <4 x i64> %a1) {
89; CHECK-LABEL: test_mm256_adds_epi8:
90; CHECK:       # %bb.0:
91; CHECK-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0
92; CHECK-NEXT:    ret{{[l|q]}}
93  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
94  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
95  %res = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
96  %bc = bitcast <32 x i8> %res to <4 x i64>
97  ret <4 x i64> %bc
98}
99declare <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
100
101define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
102; CHECK-LABEL: test_mm256_adds_epi16:
103; CHECK:       # %bb.0:
104; CHECK-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0
105; CHECK-NEXT:    ret{{[l|q]}}
106  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
107  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
108  %res = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
109  %bc = bitcast <16 x i16> %res to <4 x i64>
110  ret <4 x i64> %bc
111}
112declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
113
114define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) {
115; CHECK-LABEL: test_mm256_adds_epu8:
116; CHECK:       # %bb.0:
117; CHECK-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0
118; CHECK-NEXT:    ret{{[l|q]}}
119  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
120  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
121  %res = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
122  %bc = bitcast <32 x i8> %res to <4 x i64>
123  ret <4 x i64> %bc
124}
125declare <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8>, <32 x i8>)
126
127define <4 x i64> @test_mm256_adds_epu16(<4 x i64> %a0, <4 x i64> %a1) {
128; CHECK-LABEL: test_mm256_adds_epu16:
129; CHECK:       # %bb.0:
130; CHECK-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0
131; CHECK-NEXT:    ret{{[l|q]}}
132  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
133  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
134  %res = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
135  %bc = bitcast <16 x i16> %res to <4 x i64>
136  ret <4 x i64> %bc
137}
138declare <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16>, <16 x i16>)
139
140define <4 x i64> @test_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
141; CHECK-LABEL: test_mm256_alignr_epi8:
142; CHECK:       # %bb.0:
143; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17]
144; CHECK-NEXT:    ret{{[l|q]}}
145  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
146  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
147  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49>
148  %res = bitcast <32 x i8> %shuf to <4 x i64>
149  ret <4 x i64> %res
150}
151
152define <4 x i64> @test2_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
153; CHECK-LABEL: test2_mm256_alignr_epi8:
154; CHECK:       # %bb.0:
155; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16]
156; CHECK-NEXT:    ret{{[l|q]}}
157  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
158  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
159  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
160  %res = bitcast <32 x i8> %shuf to <4 x i64>
161  ret <4 x i64> %res
162}
163
164define <4 x i64> @test_mm256_and_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
165; CHECK-LABEL: test_mm256_and_si256:
166; CHECK:       # %bb.0:
167; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
168; CHECK-NEXT:    ret{{[l|q]}}
169  %res = and <4 x i64> %a0, %a1
170  ret <4 x i64> %res
171}
172
173define <4 x i64> @test_mm256_andnot_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
174; CHECK-LABEL: test_mm256_andnot_si256:
175; CHECK:       # %bb.0:
176; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
177; CHECK-NEXT:    vpxor %ymm2, %ymm0, %ymm0
178; CHECK-NEXT:    vpand %ymm1, %ymm0, %ymm0
179; CHECK-NEXT:    ret{{[l|q]}}
180  %not = xor <4 x i64> %a0, <i64 -1, i64 -1, i64 -1, i64 -1>
181  %res = and <4 x i64> %not, %a1
182  ret <4 x i64> %res
183}
184
185define <4 x i64> @test_mm256_avg_epu8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
186; CHECK-LABEL: test_mm256_avg_epu8:
187; CHECK:       # %bb.0:
188; CHECK-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
189; CHECK-NEXT:    ret{{[l|q]}}
190  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
191  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
192  %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %arg0, <32 x i8> %arg1)
193  %bc = bitcast <32 x i8> %res to <4 x i64>
194  ret <4 x i64> %bc
195}
196declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
197
198define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
199; CHECK-LABEL: test_mm256_avg_epu16:
200; CHECK:       # %bb.0:
201; CHECK-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
202; CHECK-NEXT:    ret{{[l|q]}}
203  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
204  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
205  %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %arg0, <16 x i16> %arg1)
206  %bc = bitcast <16 x i16> %res to <4 x i64>
207  ret <4 x i64> %bc
208}
209declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone
210
211define <4 x i64> @test_mm256_blend_epi16(<4 x i64> %a0, <4 x i64> %a1) {
212; CHECK-LABEL: test_mm256_blend_epi16:
213; CHECK:       # %bb.0:
214; CHECK-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
215; CHECK-NEXT:    ret{{[l|q]}}
216  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
217  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
218  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
219  %res = bitcast <16 x i16> %shuf to <4 x i64>
220  ret <4 x i64> %res
221}
222
223define <2 x i64> @test_mm_blend_epi32(<2 x i64> %a0, <2 x i64> %a1) {
224; CHECK-LABEL: test_mm_blend_epi32:
225; CHECK:       # %bb.0:
226; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
227; CHECK-NEXT:    ret{{[l|q]}}
228  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
229  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
230  %shuf = shufflevector <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
231  %res = bitcast <4 x i32> %shuf to <2 x i64>
232  ret <2 x i64> %res
233}
234
235define <4 x i64> @test_mm256_blend_epi32(<4 x i64> %a0, <4 x i64> %a1) {
236; CHECK-LABEL: test_mm256_blend_epi32:
237; CHECK:       # %bb.0:
238; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
239; CHECK-NEXT:    ret{{[l|q]}}
240  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
241  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
242  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
243  %res = bitcast <8 x i32> %shuf to <4 x i64>
244  ret <4 x i64> %res
245}
246
247define <4 x i64> @test_mm256_blendv_epi8(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
248; CHECK-LABEL: test_mm256_blendv_epi8:
249; CHECK:       # %bb.0:
250; CHECK-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
251; CHECK-NEXT:    ret{{[l|q]}}
252  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
253  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
254  %arg2 = bitcast <4 x i64> %a2 to <32 x i8>
255  %call = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %arg0, <32 x i8> %arg1, <32 x i8> %arg2)
256  %res = bitcast <32 x i8> %call to <4 x i64>
257  ret <4 x i64> %res
258}
259declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone
260
261define <2 x i64> @test_mm_broadcastb_epi8(<2 x i64> %a0) {
262; CHECK-LABEL: test_mm_broadcastb_epi8:
263; CHECK:       # %bb.0:
264; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
265; CHECK-NEXT:    ret{{[l|q]}}
266  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
267  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <16 x i32> zeroinitializer
268  %res = bitcast <16 x i8> %shuf to <2 x i64>
269  ret <2 x i64> %res
270}
271
272define <4 x i64> @test_mm256_broadcastb_epi8(<4 x i64> %a0) {
273; CHECK-LABEL: test_mm256_broadcastb_epi8:
274; CHECK:       # %bb.0:
275; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
276; CHECK-NEXT:    ret{{[l|q]}}
277  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
278  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> undef, <32 x i32> zeroinitializer
279  %res = bitcast <32 x i8> %shuf to <4 x i64>
280  ret <4 x i64> %res
281}
282
283define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
284; CHECK-LABEL: test_mm_broadcastd_epi32:
285; CHECK:       # %bb.0:
286; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
287; CHECK-NEXT:    ret{{[l|q]}}
288  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
289  %shuf = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
290  %res = bitcast <4 x i32> %shuf to <2 x i64>
291  ret <2 x i64> %res
292}
293
294define <4 x i64> @test_mm256_broadcastd_epi32(<4 x i64> %a0) {
295; CHECK-LABEL: test_mm256_broadcastd_epi32:
296; CHECK:       # %bb.0:
297; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
298; CHECK-NEXT:    ret{{[l|q]}}
299  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
300  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> zeroinitializer
301  %res = bitcast <8 x i32> %shuf to <4 x i64>
302  ret <4 x i64> %res
303}
304
305define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
306; CHECK-LABEL: test_mm_broadcastq_epi64:
307; CHECK:       # %bb.0:
308; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
309; CHECK-NEXT:    ret{{[l|q]}}
310  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
311  ret <2 x i64> %res
312}
313
314define <4 x i64> @test_mm256_broadcastq_epi64(<4 x i64> %a0) {
315; CHECK-LABEL: test_mm256_broadcastq_epi64:
316; CHECK:       # %bb.0:
317; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
318; CHECK-NEXT:    ret{{[l|q]}}
319  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> zeroinitializer
320  ret <4 x i64> %res
321}
322
323define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) {
324; CHECK-LABEL: test_mm_broadcastsd_pd:
325; CHECK:       # %bb.0:
326; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
327; CHECK-NEXT:    ret{{[l|q]}}
328  %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
329  ret <2 x double> %res
330}
331
332define <4 x double> @test_mm256_broadcastsd_pd(<4 x double> %a0) {
333; CHECK-LABEL: test_mm256_broadcastsd_pd:
334; CHECK:       # %bb.0:
335; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
336; CHECK-NEXT:    ret{{[l|q]}}
337  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> zeroinitializer
338  ret <4 x double> %res
339}
340
341define <4 x i64> @test_mm256_broadcastsi128_si256(<2 x i64> %a0) {
342; CHECK-LABEL: test_mm256_broadcastsi128_si256:
343; CHECK:       # %bb.0:
344; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
345; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
346; CHECK-NEXT:    ret{{[l|q]}}
347  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
348  ret <4 x i64> %res
349}
350
351define <4 x i64> @test_mm256_broadcastsi128_si256_mem(<2 x i64>* %p0) {
352; X86-LABEL: test_mm256_broadcastsi128_si256_mem:
353; X86:       # %bb.0:
354; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
355; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
356; X86-NEXT:    retl
357;
358; X64-LABEL: test_mm256_broadcastsi128_si256_mem:
359; X64:       # %bb.0:
360; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
361; X64-NEXT:    retq
362  %a0 = load <2 x i64>, <2 x i64>* %p0
363  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
364  ret <4 x i64> %res
365}
366
367define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
368; CHECK-LABEL: test_mm_broadcastss_ps:
369; CHECK:       # %bb.0:
370; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
371; CHECK-NEXT:    ret{{[l|q]}}
372  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
373  ret <4 x float> %res
374}
375
376define <8 x float> @test_mm256_broadcastss_ps(<8 x float> %a0) {
377; CHECK-LABEL: test_mm256_broadcastss_ps:
378; CHECK:       # %bb.0:
379; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
380; CHECK-NEXT:    ret{{[l|q]}}
381  %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> zeroinitializer
382  ret <8 x float> %res
383}
384
385define <2 x i64> @test_mm_broadcastw_epi16(<2 x i64> %a0) {
386; CHECK-LABEL: test_mm_broadcastw_epi16:
387; CHECK:       # %bb.0:
388; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0
389; CHECK-NEXT:    ret{{[l|q]}}
390  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
391  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> zeroinitializer
392  %res = bitcast <8 x i16> %shuf to <2 x i64>
393  ret <2 x i64> %res
394}
395
396define <4 x i64> @test_mm256_broadcastw_epi16(<4 x i64> %a0) {
397; CHECK-LABEL: test_mm256_broadcastw_epi16:
398; CHECK:       # %bb.0:
399; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0
400; CHECK-NEXT:    ret{{[l|q]}}
401  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
402  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> zeroinitializer
403  %res = bitcast <16 x i16> %shuf to <4 x i64>
404  ret <4 x i64> %res
405}
406
407define <4 x i64> @test_mm256_bslli_epi128(<4 x i64> %a0) {
408; CHECK-LABEL: test_mm256_bslli_epi128:
409; CHECK:       # %bb.0:
410; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
411; CHECK-NEXT:    ret{{[l|q]}}
412  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
413  %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
414  %res = bitcast <32 x i8> %shuf to <4 x i64>
415  ret <4 x i64> %res
416}
417
418define <4 x i64> @test_mm256_bsrli_epi128(<4 x i64> %a0) {
419; CHECK-LABEL: test_mm256_bsrli_epi128:
420; CHECK:       # %bb.0:
421; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
422; CHECK-NEXT:    ret{{[l|q]}}
423  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
424  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
425  %res = bitcast <32 x i8> %shuf to <4 x i64>
426  ret <4 x i64> %res
427}
428
429define <4 x i64> @test_mm256_cmpeq_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
430; CHECK-LABEL: test_mm256_cmpeq_epi8:
431; CHECK:       # %bb.0:
432; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
433; CHECK-NEXT:    ret{{[l|q]}}
434  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
435  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
436  %cmp = icmp eq <32 x i8> %arg0, %arg1
437  %res = sext <32 x i1> %cmp to <32 x i8>
438  %bc = bitcast <32 x i8> %res to <4 x i64>
439  ret <4 x i64> %bc
440}
441
442define <4 x i64> @test_mm256_cmpeq_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
443; CHECK-LABEL: test_mm256_cmpeq_epi16:
444; CHECK:       # %bb.0:
445; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
446; CHECK-NEXT:    ret{{[l|q]}}
447  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
448  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
449  %cmp = icmp eq <16 x i16> %arg0, %arg1
450  %res = sext <16 x i1> %cmp to <16 x i16>
451  %bc = bitcast <16 x i16> %res to <4 x i64>
452  ret <4 x i64> %bc
453}
454
455define <4 x i64> @test_mm256_cmpeq_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
456; CHECK-LABEL: test_mm256_cmpeq_epi32:
457; CHECK:       # %bb.0:
458; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
459; CHECK-NEXT:    ret{{[l|q]}}
460  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
461  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
462  %cmp = icmp eq <8 x i32> %arg0, %arg1
463  %res = sext <8 x i1> %cmp to <8 x i32>
464  %bc = bitcast <8 x i32> %res to <4 x i64>
465  ret <4 x i64> %bc
466}
467
468define <4 x i64> @test_mm256_cmpeq_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
469; CHECK-LABEL: test_mm256_cmpeq_epi64:
470; CHECK:       # %bb.0:
471; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
472; CHECK-NEXT:    ret{{[l|q]}}
473  %cmp = icmp eq <4 x i64> %a0, %a1
474  %res = sext <4 x i1> %cmp to <4 x i64>
475  ret <4 x i64> %res
476}
477
478define <4 x i64> @test_mm256_cmpgt_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
479; CHECK-LABEL: test_mm256_cmpgt_epi8:
480; CHECK:       # %bb.0:
481; CHECK-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
482; CHECK-NEXT:    ret{{[l|q]}}
483  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
484  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
485  %cmp = icmp sgt <32 x i8> %arg0, %arg1
486  %res = sext <32 x i1> %cmp to <32 x i8>
487  %bc = bitcast <32 x i8> %res to <4 x i64>
488  ret <4 x i64> %bc
489}
490
491define <4 x i64> @test_mm256_cmpgt_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
492; CHECK-LABEL: test_mm256_cmpgt_epi16:
493; CHECK:       # %bb.0:
494; CHECK-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
495; CHECK-NEXT:    ret{{[l|q]}}
496  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
497  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
498  %cmp = icmp sgt <16 x i16> %arg0, %arg1
499  %res = sext <16 x i1> %cmp to <16 x i16>
500  %bc = bitcast <16 x i16> %res to <4 x i64>
501  ret <4 x i64> %bc
502}
503
504define <4 x i64> @test_mm256_cmpgt_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
505; CHECK-LABEL: test_mm256_cmpgt_epi32:
506; CHECK:       # %bb.0:
507; CHECK-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
508; CHECK-NEXT:    ret{{[l|q]}}
509  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
510  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
511  %cmp = icmp sgt <8 x i32> %arg0, %arg1
512  %res = sext <8 x i1> %cmp to <8 x i32>
513  %bc = bitcast <8 x i32> %res to <4 x i64>
514  ret <4 x i64> %bc
515}
516
517define <4 x i64> @test_mm256_cmpgt_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
518; CHECK-LABEL: test_mm256_cmpgt_epi64:
519; CHECK:       # %bb.0:
520; CHECK-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
521; CHECK-NEXT:    ret{{[l|q]}}
522  %cmp = icmp sgt <4 x i64> %a0, %a1
523  %res = sext <4 x i1> %cmp to <4 x i64>
524  ret <4 x i64> %res
525}
526
527define <4 x i64> @test_mm256_cvtepi8_epi16(<2 x i64> %a0) {
528; CHECK-LABEL: test_mm256_cvtepi8_epi16:
529; CHECK:       # %bb.0:
530; CHECK-NEXT:    vpmovsxbw %xmm0, %ymm0
531; CHECK-NEXT:    ret{{[l|q]}}
532  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
533  %ext = sext <16 x i8> %arg0 to <16 x i16>
534  %res = bitcast <16 x i16> %ext to <4 x i64>
535  ret <4 x i64> %res
536}
537
538define <4 x i64> @test_mm256_cvtepi8_epi32(<2 x i64> %a0) {
539; CHECK-LABEL: test_mm256_cvtepi8_epi32:
540; CHECK:       # %bb.0:
541; CHECK-NEXT:    vpmovsxbd %xmm0, %ymm0
542; CHECK-NEXT:    ret{{[l|q]}}
543  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
544  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
545  %ext = sext <8 x i8> %shuf to <8 x i32>
546  %res = bitcast <8 x i32> %ext to <4 x i64>
547  ret <4 x i64> %res
548}
549
550define <4 x i64> @test_mm256_cvtepi8_epi64(<2 x i64> %a0) {
551; CHECK-LABEL: test_mm256_cvtepi8_epi64:
552; CHECK:       # %bb.0:
553; CHECK-NEXT:    vpmovsxbq %xmm0, %ymm0
554; CHECK-NEXT:    ret{{[l|q]}}
555  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
556  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
557  %ext = sext <4 x i8> %shuf to <4 x i64>
558  ret <4 x i64> %ext
559}
560
561define <4 x i64> @test_mm256_cvtepi16_epi32(<2 x i64> %a0) {
562; CHECK-LABEL: test_mm256_cvtepi16_epi32:
563; CHECK:       # %bb.0:
564; CHECK-NEXT:    vpmovsxwd %xmm0, %ymm0
565; CHECK-NEXT:    ret{{[l|q]}}
566  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
567  %ext = sext <8 x i16> %arg0 to <8 x i32>
568  %res = bitcast <8 x i32> %ext to <4 x i64>
569  ret <4 x i64> %res
570}
571
572define <4 x i64> @test_mm256_cvtepi16_epi64(<2 x i64> %a0) {
573; CHECK-LABEL: test_mm256_cvtepi16_epi64:
574; CHECK:       # %bb.0:
575; CHECK-NEXT:    vpmovsxwq %xmm0, %ymm0
576; CHECK-NEXT:    ret{{[l|q]}}
577  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
578  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
579  %ext = sext <4 x i16> %shuf to <4 x i64>
580  ret <4 x i64> %ext
581}
582
583define <4 x i64> @test_mm256_cvtepi32_epi64(<2 x i64> %a0) {
584; CHECK-LABEL: test_mm256_cvtepi32_epi64:
585; CHECK:       # %bb.0:
586; CHECK-NEXT:    vpmovsxdq %xmm0, %ymm0
587; CHECK-NEXT:    ret{{[l|q]}}
588  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
589  %ext = sext <4 x i32> %arg0 to <4 x i64>
590  ret <4 x i64> %ext
591}
592
593define <4 x i64> @test_mm256_cvtepu8_epi16(<2 x i64> %a0) {
594; CHECK-LABEL: test_mm256_cvtepu8_epi16:
595; CHECK:       # %bb.0:
596; CHECK-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
597; CHECK-NEXT:    ret{{[l|q]}}
598  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
599  %ext = zext <16 x i8> %arg0 to <16 x i16>
600  %res = bitcast <16 x i16> %ext to <4 x i64>
601  ret <4 x i64> %res
602}
603
604define <4 x i64> @test_mm256_cvtepu8_epi32(<2 x i64> %a0) {
605; CHECK-LABEL: test_mm256_cvtepu8_epi32:
606; CHECK:       # %bb.0:
607; CHECK-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
608; CHECK-NEXT:    ret{{[l|q]}}
609  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
610  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
611  %ext = zext <8 x i8> %shuf to <8 x i32>
612  %res = bitcast <8 x i32> %ext to <4 x i64>
613  ret <4 x i64> %res
614}
615
616define <4 x i64> @test_mm256_cvtepu8_epi64(<2 x i64> %a0) {
617; CHECK-LABEL: test_mm256_cvtepu8_epi64:
618; CHECK:       # %bb.0:
619; CHECK-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
620; CHECK-NEXT:    ret{{[l|q]}}
621  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
622  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
623  %ext = zext <4 x i8> %shuf to <4 x i64>
624  ret <4 x i64> %ext
625}
626
627define <4 x i64> @test_mm256_cvtepu16_epi32(<2 x i64> %a0) {
628; CHECK-LABEL: test_mm256_cvtepu16_epi32:
629; CHECK:       # %bb.0:
630; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
631; CHECK-NEXT:    ret{{[l|q]}}
632  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
633  %ext = zext <8 x i16> %arg0 to <8 x i32>
634  %res = bitcast <8 x i32> %ext to <4 x i64>
635  ret <4 x i64> %res
636}
637
638define <4 x i64> @test_mm256_cvtepu16_epi64(<2 x i64> %a0) {
639; CHECK-LABEL: test_mm256_cvtepu16_epi64:
640; CHECK:       # %bb.0:
641; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
642; CHECK-NEXT:    ret{{[l|q]}}
643  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
644  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
645  %ext = zext <4 x i16> %shuf to <4 x i64>
646  ret <4 x i64> %ext
647}
648
649define <4 x i64> @test_mm256_cvtepu32_epi64(<2 x i64> %a0) {
650; CHECK-LABEL: test_mm256_cvtepu32_epi64:
651; CHECK:       # %bb.0:
652; CHECK-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
653; CHECK-NEXT:    ret{{[l|q]}}
654  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
655  %ext = zext <4 x i32> %arg0 to <4 x i64>
656  ret <4 x i64> %ext
657}
658
659define <2 x i64> @test_mm256_extracti128_si256(<4 x i64> %a0) nounwind {
660; CHECK-LABEL: test_mm256_extracti128_si256:
661; CHECK:       # %bb.0:
662; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
663; CHECK-NEXT:    vzeroupper
664; CHECK-NEXT:    ret{{[l|q]}}
665  %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3>
666  ret <2 x i64> %res
667}
668
669define <4 x i64> @test_mm256_hadd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
670; CHECK-LABEL: test_mm256_hadd_epi16:
671; CHECK:       # %bb.0:
672; CHECK-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
673; CHECK-NEXT:    ret{{[l|q]}}
674  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
675  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
676  %res = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %arg0, <16 x i16> %arg1)
677  %bc = bitcast <16 x i16> %res to <4 x i64>
678  ret <4 x i64> %bc
679}
680declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone
681
682define <4 x i64> @test_mm256_hadd_epi32(<4 x i64> %a0, <4 x i64> %a1) {
683; CHECK-LABEL: test_mm256_hadd_epi32:
684; CHECK:       # %bb.0:
685; CHECK-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
686; CHECK-NEXT:    ret{{[l|q]}}
687  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
688  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
689  %res = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %arg0, <8 x i32> %arg1)
690  %bc = bitcast <8 x i32> %res to <4 x i64>
691  ret <4 x i64> %bc
692}
693declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone
694
695define <4 x i64> @test_mm256_hadds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
696; CHECK-LABEL: test_mm256_hadds_epi16:
697; CHECK:       # %bb.0:
698; CHECK-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0
699; CHECK-NEXT:    ret{{[l|q]}}
700  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
701  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
702  %res = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %arg0, <16 x i16> %arg1)
703  %bc = bitcast <16 x i16> %res to <4 x i64>
704  ret <4 x i64> %bc
705}
706declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone
707
708define <4 x i64> @test_mm256_hsub_epi16(<4 x i64> %a0, <4 x i64> %a1) {
709; CHECK-LABEL: test_mm256_hsub_epi16:
710; CHECK:       # %bb.0:
711; CHECK-NEXT:    vphsubw %ymm1, %ymm0, %ymm0
712; CHECK-NEXT:    ret{{[l|q]}}
713  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
714  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
715  %res = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %arg0, <16 x i16> %arg1)
716  %bc = bitcast <16 x i16> %res to <4 x i64>
717  ret <4 x i64> %bc
718}
719declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone
720
721define <4 x i64> @test_mm256_hsub_epi32(<4 x i64> %a0, <4 x i64> %a1) {
722; CHECK-LABEL: test_mm256_hsub_epi32:
723; CHECK:       # %bb.0:
724; CHECK-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
725; CHECK-NEXT:    ret{{[l|q]}}
726  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
727  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
728  %res = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %arg0, <8 x i32> %arg1)
729  %bc = bitcast <8 x i32> %res to <4 x i64>
730  ret <4 x i64> %bc
731}
732declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone
733
734define <4 x i64> @test_mm256_hsubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
735; CHECK-LABEL: test_mm256_hsubs_epi16:
736; CHECK:       # %bb.0:
737; CHECK-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0
738; CHECK-NEXT:    ret{{[l|q]}}
739  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
740  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
741  %res = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %arg0, <16 x i16> %arg1)
742  %bc = bitcast <16 x i16> %res to <4 x i64>
743  ret <4 x i64> %bc
744}
745declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone
746
747define <2 x i64> @test_mm_i32gather_epi32(i32 *%a0, <2 x i64> %a1) {
748; X86-LABEL: test_mm_i32gather_epi32:
749; X86:       # %bb.0:
750; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
751; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
752; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
753; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm0,2), %xmm1
754; X86-NEXT:    vmovdqa %xmm1, %xmm0
755; X86-NEXT:    retl
756;
757; X64-LABEL: test_mm_i32gather_epi32:
758; X64:       # %bb.0:
759; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
760; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
761; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm0,2), %xmm1
762; X64-NEXT:    vmovdqa %xmm1, %xmm0
763; X64-NEXT:    retq
764  %arg0 = bitcast i32 *%a0 to i8*
765  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
766  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
767  %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, i8* %arg0, <4 x i32> %arg1, <4 x i32> %mask, i8 2)
768  %bc = bitcast <4 x i32> %call to <2 x i64>
769  ret <2 x i64> %bc
770}
771declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*, <4 x i32>, <4 x i32>, i8) nounwind readonly
772
773define <2 x i64> @test_mm_mask_i32gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
774; X86-LABEL: test_mm_mask_i32gather_epi32:
775; X86:       # %bb.0:
776; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
777; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0
778; X86-NEXT:    retl
779;
780; X64-LABEL: test_mm_mask_i32gather_epi32:
781; X64:       # %bb.0:
782; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0
783; X64-NEXT:    retq
784  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
785  %arg1 = bitcast i32 *%a1 to i8*
786  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
787  %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
788  %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %arg0, i8* %arg1, <4 x i32> %arg2, <4 x i32> %arg3, i8 2)
789  %bc = bitcast <4 x i32> %call to <2 x i64>
790  ret <2 x i64> %bc
791}
792
793define <4 x i64> @test_mm256_i32gather_epi32(i32 *%a0, <4 x i64> %a1) {
794; X86-LABEL: test_mm256_i32gather_epi32:
795; X86:       # %bb.0:
796; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
797; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
798; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
799; X86-NEXT:    vpgatherdd %ymm2, (%eax,%ymm0,2), %ymm1
800; X86-NEXT:    vmovdqa %ymm1, %ymm0
801; X86-NEXT:    retl
802;
803; X64-LABEL: test_mm256_i32gather_epi32:
804; X64:       # %bb.0:
805; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
806; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
807; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm0,2), %ymm1
808; X64-NEXT:    vmovdqa %ymm1, %ymm0
809; X64-NEXT:    retq
810  %arg0 = bitcast i32 *%a0 to i8*
811  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
812  %mask = bitcast <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1> to <8 x i32>
813  %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8* %arg0, <8 x i32> %arg1, <8 x i32> %mask, i8 2)
814  %bc = bitcast <8 x i32> %call to <4 x i64>
815  ret <4 x i64> %bc
816}
817declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*, <8 x i32>, <8 x i32>, i8) nounwind readonly
818
819define <4 x i64> @test_mm256_mask_i32gather_epi32(<4 x i64> %a0, i32 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
820; X86-LABEL: test_mm256_mask_i32gather_epi32:
821; X86:       # %bb.0:
822; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
823; X86-NEXT:    vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0
824; X86-NEXT:    retl
825;
826; X64-LABEL: test_mm256_mask_i32gather_epi32:
827; X64:       # %bb.0:
828; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0
829; X64-NEXT:    retq
830  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
831  %arg1 = bitcast i32 *%a1 to i8*
832  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
833  %arg3 = bitcast <4 x i64> %a3 to <8 x i32>
834  %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %arg0, i8* %arg1, <8 x i32> %arg2, <8 x i32> %arg3, i8 2)
835  %bc = bitcast <8 x i32> %call to <4 x i64>
836  ret <4 x i64> %bc
837}
838
839define <2 x i64> @test_mm_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
840; X86-LABEL: test_mm_i32gather_epi64:
841; X86:       # %bb.0:
842; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
843; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
844; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
845; X86-NEXT:    vpgatherdq %xmm2, (%eax,%xmm0,2), %xmm1
846; X86-NEXT:    vmovdqa %xmm1, %xmm0
847; X86-NEXT:    retl
848;
849; X64-LABEL: test_mm_i32gather_epi64:
850; X64:       # %bb.0:
851; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
852; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
853; X64-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm0,2), %xmm1
854; X64-NEXT:    vmovdqa %xmm1, %xmm0
855; X64-NEXT:    retq
856  %arg0 = bitcast i64 *%a0 to i8*
857  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
858  %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> undef, i8* %arg0, <4 x i32> %arg1, <2 x i64> <i64 -1, i64 -1>, i8 2)
859  ret <2 x i64> %res
860}
861declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, i8*, <4 x i32>, <2 x i64>, i8) nounwind readonly
862
863define <2 x i64> @test_mm_mask_i32gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
864; X86-LABEL: test_mm_mask_i32gather_epi64:
865; X86:       # %bb.0:
866; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
867; X86-NEXT:    vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0
868; X86-NEXT:    retl
869;
870; X64-LABEL: test_mm_mask_i32gather_epi64:
871; X64:       # %bb.0:
872; X64-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0
873; X64-NEXT:    retq
874  %arg1 = bitcast i64 *%a1 to i8*
875  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
876  %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <2 x i64> %a3, i8 2)
877  ret <2 x i64> %res
878}
879
880define <4 x i64> @test_mm256_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
881; X86-LABEL: test_mm256_i32gather_epi64:
882; X86:       # %bb.0:
883; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
884; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
885; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
886; X86-NEXT:    vpgatherdq %ymm2, (%eax,%xmm0,2), %ymm1
887; X86-NEXT:    vmovdqa %ymm1, %ymm0
888; X86-NEXT:    retl
889;
890; X64-LABEL: test_mm256_i32gather_epi64:
891; X64:       # %bb.0:
892; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
893; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
894; X64-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm0,2), %ymm1
895; X64-NEXT:    vmovdqa %ymm1, %ymm0
896; X64-NEXT:    retq
897  %arg0 = bitcast i64 *%a0 to i8*
898  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
899  %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8* %arg0, <4 x i32> %arg1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
900  ret <4 x i64> %res
901}
902declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, i8*, <4 x i32>, <4 x i64>, i8) nounwind readonly
903
904define <4 x i64> @test_mm256_mask_i32gather_epi64(<4 x i64> %a0, i64 *%a1, <2 x i64> %a2, <4 x i64> %a3) {
905; X86-LABEL: test_mm256_mask_i32gather_epi64:
906; X86:       # %bb.0:
907; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
908; X86-NEXT:    vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0
909; X86-NEXT:    retl
910;
911; X64-LABEL: test_mm256_mask_i32gather_epi64:
912; X64:       # %bb.0:
913; X64-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0
914; X64-NEXT:    retq
915  %arg1 = bitcast i64 *%a1 to i8*
916  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
917  %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <4 x i64> %a3, i8 2)
918  ret <4 x i64> %res
919}
920
921define <2 x double> @test_mm_i32gather_pd(double *%a0, <2 x i64> %a1) {
922; X86-LABEL: test_mm_i32gather_pd:
923; X86:       # %bb.0:
924; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
925; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
926; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
927; X86-NEXT:    vgatherdpd %xmm2, (%eax,%xmm0,2), %xmm1
928; X86-NEXT:    vmovapd %xmm1, %xmm0
929; X86-NEXT:    retl
930;
931; X64-LABEL: test_mm_i32gather_pd:
932; X64:       # %bb.0:
933; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
934; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
935; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm0,2), %xmm1
936; X64-NEXT:    vmovapd %xmm1, %xmm0
937; X64-NEXT:    retq
938  %arg0 = bitcast double *%a0 to i8*
939  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
940  %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
941  %sext = sext <2 x i1> %cmp to <2 x i64>
942  %mask = bitcast <2 x i64> %sext to <2 x double>
943  %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> undef, i8* %arg0, <4 x i32> %arg1, <2 x double> %mask, i8 2)
944  ret <2 x double> %res
945}
946declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*, <4 x i32>, <2 x double>, i8) nounwind readonly
947
948define <2 x double> @test_mm_mask_i32gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
949; X86-LABEL: test_mm_mask_i32gather_pd:
950; X86:       # %bb.0:
951; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
952; X86-NEXT:    vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0
953; X86-NEXT:    retl
954;
955; X64-LABEL: test_mm_mask_i32gather_pd:
956; X64:       # %bb.0:
957; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0
958; X64-NEXT:    retq
959  %arg1 = bitcast double *%a1 to i8*
960  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
961  %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0, i8* %arg1, <4 x i32> %arg2, <2 x double> %a3, i8 2)
962  ret <2 x double> %res
963}
964
965define <4 x double> @test_mm256_i32gather_pd(double *%a0, <2 x i64> %a1) {
966; X86-LABEL: test_mm256_i32gather_pd:
967; X86:       # %bb.0:
968; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
969; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
970; X86-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
971; X86-NEXT:    vgatherdpd %ymm2, (%eax,%xmm0,2), %ymm1
972; X86-NEXT:    vmovapd %ymm1, %ymm0
973; X86-NEXT:    retl
974;
975; X64-LABEL: test_mm256_i32gather_pd:
976; X64:       # %bb.0:
977; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
978; X64-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
979; X64-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm0,2), %ymm1
980; X64-NEXT:    vmovapd %ymm1, %ymm0
981; X64-NEXT:    retq
982  %arg0 = bitcast double *%a0 to i8*
983  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
984  %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
985  %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8* %arg0, <4 x i32> %arg1, <4 x double> %mask, i8 2)
986  ret <4 x double> %res
987}
988declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*, <4 x i32>, <4 x double>, i8) nounwind readonly
989
990define <4 x double> @test_mm256_mask_i32gather_pd(<4 x double> %a0, double *%a1, <2 x i64> %a2, <4 x double> %a3) {
991; X86-LABEL: test_mm256_mask_i32gather_pd:
992; X86:       # %bb.0:
993; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
994; X86-NEXT:    vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0
995; X86-NEXT:    retl
996;
997; X64-LABEL: test_mm256_mask_i32gather_pd:
998; X64:       # %bb.0:
999; X64-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm1,2), %ymm0
1000; X64-NEXT:    retq
1001  %arg1 = bitcast double *%a1 to i8*
1002  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
1003  %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0, i8* %arg1, <4 x i32> %arg2, <4 x double> %a3, i8 2)
1004  ret <4 x double> %res
1005}
1006
1007define <4 x float> @test_mm_i32gather_ps(float *%a0, <2 x i64> %a1) {
1008; X86-LABEL: test_mm_i32gather_ps:
1009; X86:       # %bb.0:
1010; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1011; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1012; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1013; X86-NEXT:    vgatherdps %xmm2, (%eax,%xmm0,2), %xmm1
1014; X86-NEXT:    vmovaps %xmm1, %xmm0
1015; X86-NEXT:    retl
1016;
1017; X64-LABEL: test_mm_i32gather_ps:
1018; X64:       # %bb.0:
1019; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1020; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1021; X64-NEXT:    vgatherdps %xmm2, (%rdi,%xmm0,2), %xmm1
1022; X64-NEXT:    vmovaps %xmm1, %xmm0
1023; X64-NEXT:    retq
1024  %arg0 = bitcast float *%a0 to i8*
1025  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1026  %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
1027  %sext = sext <4 x i1> %cmp to <4 x i32>
1028  %mask = bitcast <4 x i32> %sext to <4 x float>
1029  %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8* %arg0, <4 x i32> %arg1, <4 x float> %mask, i8 2)
1030  ret <4 x float> %call
1031}
1032declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*, <4 x i32>, <4 x float>, i8) nounwind readonly
1033
1034define <4 x float> @test_mm_mask_i32gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
1035; X86-LABEL: test_mm_mask_i32gather_ps:
1036; X86:       # %bb.0:
1037; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1038; X86-NEXT:    vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0
1039; X86-NEXT:    retl
1040;
1041; X64-LABEL: test_mm_mask_i32gather_ps:
1042; X64:       # %bb.0:
1043; X64-NEXT:    vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0
1044; X64-NEXT:    retq
1045  %arg1 = bitcast float *%a1 to i8*
1046  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
1047  %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0, i8* %arg1, <4 x i32> %arg2, <4 x float> %a3, i8 2)
1048  ret <4 x float> %call
1049}
1050
1051define <8 x float> @test_mm256_i32gather_ps(float *%a0, <4 x i64> %a1) {
1052; X86-LABEL: test_mm256_i32gather_ps:
1053; X86:       # %bb.0:
1054; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1055; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1056; X86-NEXT:    vcmpeqps %ymm1, %ymm1, %ymm2
1057; X86-NEXT:    vgatherdps %ymm2, (%eax,%ymm0,2), %ymm1
1058; X86-NEXT:    vmovaps %ymm1, %ymm0
1059; X86-NEXT:    retl
1060;
1061; X64-LABEL: test_mm256_i32gather_ps:
1062; X64:       # %bb.0:
1063; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1064; X64-NEXT:    vcmpeqps %ymm1, %ymm1, %ymm2
1065; X64-NEXT:    vgatherdps %ymm2, (%rdi,%ymm0,2), %ymm1
1066; X64-NEXT:    vmovaps %ymm1, %ymm0
1067; X64-NEXT:    retq
1068  %arg0 = bitcast float *%a0 to i8*
1069  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1070  %mask = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i8 0)
1071  %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8* %arg0, <8 x i32> %arg1, <8 x float> %mask, i8 2)
1072  ret <8 x float> %call
1073}
1074declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*, <8 x i32>, <8 x float>, i8) nounwind readonly
1075
1076define <8 x float> @test_mm256_mask_i32gather_ps(<8 x float> %a0, float *%a1, <4 x i64> %a2, <8 x float> %a3) {
1077; X86-LABEL: test_mm256_mask_i32gather_ps:
1078; X86:       # %bb.0:
1079; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1080; X86-NEXT:    vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0
1081; X86-NEXT:    retl
1082;
1083; X64-LABEL: test_mm256_mask_i32gather_ps:
1084; X64:       # %bb.0:
1085; X64-NEXT:    vgatherdps %ymm2, (%rdi,%ymm1,2), %ymm0
1086; X64-NEXT:    retq
1087  %arg1 = bitcast float *%a1 to i8*
1088  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
1089  %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0, i8* %arg1, <8 x i32> %arg2, <8 x float> %a3, i8 2)
1090  ret <8 x float> %call
1091}
1092
1093define <2 x i64> @test_mm_i64gather_epi32(i32 *%a0, <2 x i64> %a1) {
1094; X86-LABEL: test_mm_i64gather_epi32:
1095; X86:       # %bb.0:
1096; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1097; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1098; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1099; X86-NEXT:    vpgatherqd %xmm2, (%eax,%xmm0,2), %xmm1
1100; X86-NEXT:    vmovdqa %xmm1, %xmm0
1101; X86-NEXT:    retl
1102;
1103; X64-LABEL: test_mm_i64gather_epi32:
1104; X64:       # %bb.0:
1105; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1106; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1107; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm0,2), %xmm1
1108; X64-NEXT:    vmovdqa %xmm1, %xmm0
1109; X64-NEXT:    retq
1110  %arg0 = bitcast i32 *%a0 to i8*
1111  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
1112  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> undef, i8* %arg0, <2 x i64> %a1, <4 x i32> %mask, i8 2)
1113  %bc = bitcast <4 x i32> %call to <2 x i64>
1114  ret <2 x i64> %bc
1115}
1116declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, i8*, <2 x i64>, <4 x i32>, i8) nounwind readonly
1117
1118define <2 x i64> @test_mm_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
1119; X86-LABEL: test_mm_mask_i64gather_epi32:
1120; X86:       # %bb.0:
1121; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1122; X86-NEXT:    vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0
1123; X86-NEXT:    retl
1124;
1125; X64-LABEL: test_mm_mask_i64gather_epi32:
1126; X64:       # %bb.0:
1127; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0
1128; X64-NEXT:    retq
1129  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
1130  %arg1 = bitcast i32 *%a1 to i8*
1131  %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
1132  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %arg0, i8* %arg1, <2 x i64> %a2, <4 x i32> %arg3, i8 2)
1133  %bc = bitcast <4 x i32> %call to <2 x i64>
1134  ret <2 x i64> %bc
1135}
1136
1137define <2 x i64> @test_mm256_i64gather_epi32(i32 *%a0, <4 x i64> %a1) {
1138; X86-LABEL: test_mm256_i64gather_epi32:
1139; X86:       # %bb.0:
1140; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1141; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1142; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1143; X86-NEXT:    vpgatherqd %xmm2, (%eax,%ymm0,2), %xmm1
1144; X86-NEXT:    vmovdqa %xmm1, %xmm0
1145; X86-NEXT:    vzeroupper
1146; X86-NEXT:    retl
1147;
1148; X64-LABEL: test_mm256_i64gather_epi32:
1149; X64:       # %bb.0:
1150; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1151; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1152; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm0,2), %xmm1
1153; X64-NEXT:    vmovdqa %xmm1, %xmm0
1154; X64-NEXT:    vzeroupper
1155; X64-NEXT:    retq
1156  %arg0 = bitcast i32 *%a0 to i8*
1157  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
1158  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8* %arg0, <4 x i64> %a1, <4 x i32> %mask, i8 2)
1159  %bc = bitcast <4 x i32> %call to <2 x i64>
1160  ret <2 x i64> %bc
1161}
1162declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, i8*, <4 x i64>, <4 x i32>, i8) nounwind readonly
1163
1164define <2 x i64> @test_mm256_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <4 x i64> %a2, <2 x i64> %a3) {
1165; X86-LABEL: test_mm256_mask_i64gather_epi32:
1166; X86:       # %bb.0:
1167; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1168; X86-NEXT:    vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0
1169; X86-NEXT:    vzeroupper
1170; X86-NEXT:    retl
1171;
1172; X64-LABEL: test_mm256_mask_i64gather_epi32:
1173; X64:       # %bb.0:
1174; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0
1175; X64-NEXT:    vzeroupper
1176; X64-NEXT:    retq
1177  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
1178  %arg1 = bitcast i32 *%a1 to i8*
1179  %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
1180  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %arg0, i8* %arg1, <4 x i64> %a2, <4 x i32> %arg3, i8 2)
1181  %bc = bitcast <4 x i32> %call to <2 x i64>
1182  ret <2 x i64> %bc
1183}
1184
1185define <2 x i64> @test_mm_i64gather_epi64(i64 *%a0, <2 x i64> %a1) {
1186; X86-LABEL: test_mm_i64gather_epi64:
1187; X86:       # %bb.0:
1188; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1189; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1190; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1191; X86-NEXT:    vpgatherqq %xmm2, (%eax,%xmm0,2), %xmm1
1192; X86-NEXT:    vmovdqa %xmm1, %xmm0
1193; X86-NEXT:    retl
1194;
1195; X64-LABEL: test_mm_i64gather_epi64:
1196; X64:       # %bb.0:
1197; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1198; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1199; X64-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm0,2), %xmm1
1200; X64-NEXT:    vmovdqa %xmm1, %xmm0
1201; X64-NEXT:    retq
1202  %arg0 = bitcast i64 *%a0 to i8*
1203  %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> undef, i8* %arg0, <2 x i64> %a1, <2 x i64> <i64 -1, i64 -1>, i8 2)
1204  ret <2 x i64> %call
1205}
1206declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, i8*, <2 x i64>, <2 x i64>, i8) nounwind readonly
1207
1208define <2 x i64> @test_mm_mask_i64gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
1209; X86-LABEL: test_mm_mask_i64gather_epi64:
1210; X86:       # %bb.0:
1211; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1212; X86-NEXT:    vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0
1213; X86-NEXT:    retl
1214;
1215; X64-LABEL: test_mm_mask_i64gather_epi64:
1216; X64:       # %bb.0:
1217; X64-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0
1218; X64-NEXT:    retq
1219  %arg1 = bitcast i64 *%a1 to i8*
1220  %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0, i8* %arg1, <2 x i64> %a2, <2 x i64> %a3, i8 2)
1221  ret <2 x i64> %call
1222}
1223
1224define <4 x i64> @test_mm256_i64gather_epi64(i64 *%a0, <4 x i64> %a1) {
1225; X86-LABEL: test_mm256_i64gather_epi64:
1226; X86:       # %bb.0:
1227; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1228; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
1229; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1230; X86-NEXT:    vpgatherqq %ymm2, (%eax,%ymm0,2), %ymm1
1231; X86-NEXT:    vmovdqa %ymm1, %ymm0
1232; X86-NEXT:    retl
1233;
1234; X64-LABEL: test_mm256_i64gather_epi64:
1235; X64:       # %bb.0:
1236; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
1237; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1238; X64-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm0,2), %ymm1
1239; X64-NEXT:    vmovdqa %ymm1, %ymm0
1240; X64-NEXT:    retq
1241  %arg0 = bitcast i64 *%a0 to i8*
1242  %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8* %arg0, <4 x i64> %a1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
1243  ret <4 x i64> %call
1244}
1245declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, i8*, <4 x i64>, <4 x i64>, i8) nounwind readonly
1246
1247define <4 x i64> @test_mm256_mask_i64gather_epi64(<4 x i64> %a0, i64 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
1248; X86-LABEL: test_mm256_mask_i64gather_epi64:
1249; X86:       # %bb.0:
1250; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1251; X86-NEXT:    vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0
1252; X86-NEXT:    retl
1253;
1254; X64-LABEL: test_mm256_mask_i64gather_epi64:
1255; X64:       # %bb.0:
1256; X64-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0
1257; X64-NEXT:    retq
1258  %arg1 = bitcast i64 *%a1 to i8*
1259  %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0, i8* %arg1, <4 x i64> %a2, <4 x i64> %a3, i8 2)
1260  ret <4 x i64> %call
1261}
1262
1263define <2 x double> @test_mm_i64gather_pd(double *%a0, <2 x i64> %a1) {
1264; X86-LABEL: test_mm_i64gather_pd:
1265; X86:       # %bb.0:
1266; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1267; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1268; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1269; X86-NEXT:    vgatherqpd %xmm2, (%eax,%xmm0,2), %xmm1
1270; X86-NEXT:    vmovapd %xmm1, %xmm0
1271; X86-NEXT:    retl
1272;
1273; X64-LABEL: test_mm_i64gather_pd:
1274; X64:       # %bb.0:
1275; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1276; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1277; X64-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm0,2), %xmm1
1278; X64-NEXT:    vmovapd %xmm1, %xmm0
1279; X64-NEXT:    retq
1280  %arg0 = bitcast double *%a0 to i8*
1281  %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
1282  %sext = sext <2 x i1> %cmp to <2 x i64>
1283  %mask = bitcast <2 x i64> %sext to <2 x double>
1284  %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> undef, i8* %arg0, <2 x i64> %a1, <2 x double> %mask, i8 2)
1285  ret <2 x double> %call
1286}
1287declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, i8*, <2 x i64>, <2 x double>, i8) nounwind readonly
1288
1289define <2 x double> @test_mm_mask_i64gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
1290; X86-LABEL: test_mm_mask_i64gather_pd:
1291; X86:       # %bb.0:
1292; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1293; X86-NEXT:    vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0
1294; X86-NEXT:    retl
1295;
1296; X64-LABEL: test_mm_mask_i64gather_pd:
1297; X64:       # %bb.0:
1298; X64-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0
1299; X64-NEXT:    retq
1300  %arg1 = bitcast double *%a1 to i8*
1301  %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0, i8* %arg1, <2 x i64> %a2, <2 x double> %a3, i8 2)
1302  ret <2 x double> %call
1303}
1304
define <4 x double> @test_mm256_i64gather_pd(double *%a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i64gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
; X86-NEXT:    vgatherqpd %ymm2, (%eax,%ymm0,2), %ymm1
; X86-NEXT:    vmovapd %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i64gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
; X64-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT:    vmovapd %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast double *%a0 to i8*
  %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
  %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8* %arg0, <4 x i64> %a1, <4 x double> %mask, i8 2)
  ret <4 x double> %call
}
declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, i8*, <4 x i64>, <4 x double>, i8) nounwind readonly

define <4 x double> @test_mm256_mask_i64gather_pd(<4 x double> %a0, double *%a1, <4 x i64> %a2, <4 x double> %a3) {
; X86-LABEL: test_mm256_mask_i64gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i64gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast double *%a1 to i8*
  %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0, i8* %arg1, <4 x i64> %a2, <4 x double> %a3, i8 2)
  ret <4 x double> %call
}

define <4 x float> @test_mm_i64gather_ps(float *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i64gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherqps %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovaps %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i64gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovaps %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast float *%a0 to i8*
  %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %mask = bitcast <4 x i32> %sext to <4 x float>
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> undef, i8* %arg0, <2 x i64> %a1, <4 x float> %mask, i8 2)
  ret <4 x float> %call
}
declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*, <2 x i64>, <4 x float>, i8) nounwind readonly

define <4 x float> @test_mm_mask_i64gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
; X86-LABEL: test_mm_mask_i64gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i64gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast float *%a1 to i8*
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0, i8* %arg1, <2 x i64> %a2, <4 x float> %a3, i8 2)
  ret <4 x float> %call
}

define <4 x float> @test_mm256_i64gather_ps(float *%a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i64gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherqps %xmm2, (%eax,%ymm0,2), %xmm1
; X86-NEXT:    vmovaps %xmm1, %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i64gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%ymm0,2), %xmm1
; X64-NEXT:    vmovaps %xmm1, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast float *%a0 to i8*
  %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %mask = bitcast <4 x i32> %sext to <4 x float>
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8* %arg0, <4 x i64> %a1, <4 x float> %mask, i8 2)
  ret <4 x float> %call
}
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, i8*, <4 x i64>, <4 x float>, i8) nounwind readonly

define <4 x float> @test_mm256_mask_i64gather_ps(<4 x float> %a0, float *%a1, <4 x i64> %a2, <4 x float> %a3) {
; X86-LABEL: test_mm256_mask_i64gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i64gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%ymm1,2), %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg1 = bitcast float *%a1 to i8*
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0, i8* %arg1, <4 x i64> %a2, <4 x float> %a3, i8 2)
  ret <4 x float> %call
}

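; inserti128 is modelled as two shuffles: widen the 128-bit source, then pick
; lanes. test0 replaces the low 128 bits (lowered to a blend), test1 the high
; 128 bits (lowered to vinsertf128).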
define <4 x i64> @test0_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test0_mm256_inserti128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x i64> %res
}

define <4 x i64> @test1_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test1_mm256_inserti128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_madd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_madd_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_maddubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_maddubs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone

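; The maskload/maskstore tests rely on vpmaskmov semantics: only the sign bit
; of each mask element matters, masked-off load lanes read as zero, and
; masked-off store lanes are left untouched in memory.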
define <2 x i64> @test_mm_maskload_epi32(i32* %a0, <2 x i64> %a1) nounwind {
; X86-LABEL: test_mm_maskload_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovd (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskload_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i32* %a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.maskload.d(i8*, <4 x i32>) nounwind readonly

define <4 x i64> @test_mm256_maskload_epi32(i32* %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_maskload_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovd (%eax), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskload_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast i32* %a0 to i8*
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.maskload.d.256(i8*, <8 x i32>) nounwind readonly

define <2 x i64> @test_mm_maskload_epi64(i64* %a0, <2 x i64> %a1) nounwind {
; X86-LABEL: test_mm_maskload_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovq (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskload_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to i8*
  %res = call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %arg0, <2 x i64> %a1)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.maskload.q(i8*, <2 x i64>) nounwind readonly

define <4 x i64> @test_mm256_maskload_epi64(i64* %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_maskload_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovq (%eax), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskload_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to i8*
  %res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %arg0, <4 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.maskload.q.256(i8*, <4 x i64>) nounwind readonly

define void @test_mm_maskstore_epi32(i32* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X86-LABEL: test_mm_maskstore_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovd %xmm1, %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskstore_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast i32* %a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  call void @llvm.x86.avx2.maskstore.d(i8* %arg0, <4 x i32> %arg1, <4 x i32> %arg2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.d(i8*, <4 x i32>, <4 x i32>) nounwind readnone

define void @test_mm256_maskstore_epi32(i32* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
; X86-LABEL: test_mm256_maskstore_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovd %ymm1, %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskstore_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast i32* %a0 to i8*
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
  call void @llvm.x86.avx2.maskstore.d.256(i8* %arg0, <8 x i32> %arg1, <8 x i32> %arg2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>) nounwind readnone

define void @test_mm_maskstore_epi64(i64* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X86-LABEL: test_mm_maskstore_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovq %xmm1, %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskstore_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovq %xmm1, %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to i8*
  call void @llvm.x86.avx2.maskstore.q(i8* %arg0, <2 x i64> %a1, <2 x i64> %a2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.q(i8*, <2 x i64>, <2 x i64>) nounwind readnone

define void @test_mm256_maskstore_epi64(i64* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
; X86-LABEL: test_mm256_maskstore_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovq %ymm1, %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskstore_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovq %ymm1, %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to i8*
  call void @llvm.x86.avx2.maskstore.q.256(i8* %arg0, <4 x i64> %a1, <4 x i64> %a2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>) nounwind readnone

define <4 x i64> @test_mm256_max_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %sel = call <32 x i8> @llvm.smax.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.smax.v32i8(<32 x i8>, <32 x i8>)

define <4 x i64> @test_mm256_max_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %sel = call <16 x i16> @llvm.smax.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.smax.v16i16(<16 x i16>, <16 x i16>)

define <4 x i64> @test_mm256_max_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %sel = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>)

define <4 x i64> @test_mm256_max_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %sel = call <32 x i8> @llvm.umax.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.umax.v32i8(<32 x i8>, <32 x i8>)

define <4 x i64> @test_mm256_max_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %sel = call <16 x i16> @llvm.umax.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.umax.v16i16(<16 x i16>, <16 x i16>)

define <4 x i64> @test_mm256_max_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epu32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %sel = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.umax.v8i32(<8 x i32>, <8 x i32>)

define <4 x i64> @test_mm256_min_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %sel = call <32 x i8> @llvm.smin.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.smin.v32i8(<32 x i8>, <32 x i8>)

define <4 x i64> @test_mm256_min_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %sel = call <16 x i16> @llvm.smin.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.smin.v16i16(<16 x i16>, <16 x i16>)

define <4 x i64> @test_mm256_min_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %sel = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>)

define <4 x i64> @test_mm256_min_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminub %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %sel = call <32 x i8> @llvm.umin.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.umin.v32i8(<32 x i8>, <32 x i8>)

define <4 x i64> @test_mm256_min_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %sel = call <16 x i16> @llvm.umin.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.umin.v16i16(<16 x i16>, <16 x i16>)

define <4 x i64> @test_mm256_min_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epu32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminud %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %sel = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.umin.v8i32(<8 x i32>, <8 x i32>)

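; vpmovmskb packs the sign bit of each of the 32 bytes into a GPR; because the
; result is scalar, llc also emits vzeroupper before returning to avoid
; AVX/SSE transition penalties in subsequent code.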
define i32 @test_mm256_movemask_epi8(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_movemask_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovmskb %ymm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %arg0)
  ret i32 %res
}
declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_mpsadbw_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mpsadbw_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmpsadbw $3, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %call = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %arg0, <32 x i8> %arg1, i8 3)
  %bc = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone

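; _mm256_mul_epi32 is expressed as a shl/ashr-by-32 pair, which sign-extends
; the low 32 bits of each i64 lane in place; the backend matches this pattern
; to vpmuldq. The @llvm.x86.avx2.pmul.dq declaration below appears to be a
; leftover and is no longer called.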
define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mul_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %A = shl <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
  %A1 = ashr exact <4 x i64> %A, <i64 32, i64 32, i64 32, i64 32>
  %B = shl <4 x i64> %a1, <i64 32, i64 32, i64 32, i64 32>
  %B1 = ashr exact <4 x i64> %B, <i64 32, i64 32, i64 32, i64 32>
  %res = mul nsw <4 x i64> %A1, %B1
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone

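; Likewise, masking both operands with 0xffffffff zero-extends the low halves
; so the nuw multiply is matched to vpmuludq; the @llvm.x86.avx2.pmulu.dq
; declaration appears to be unused as well.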
define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mul_epu32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %A = and <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %B = and <4 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %res = mul nuw <4 x i64> %A, %B
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_mulhi_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhi_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mulhi_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhi_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone

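; pmulhrsw: multiply the 16-bit lanes, shift the 32-bit product right by 14,
; add 1, then shift right by 1 - a rounded high-half multiply.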
define <4 x i64> @test_mm256_mulhrs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhrs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mullo_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mullo_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = mul <16 x i16> %arg0, %arg1
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_mullo_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mullo_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = mul <8 x i32> %arg0, %arg1
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_or_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_or_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = or <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_packs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_packs_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packs_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_packus_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packus_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_packus_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packus_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone

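; The <2,3,6,7> mask selects the upper 128-bit half of each source; llc emits
; this as vperm2f128 $0x31 (the FP-domain form is used even for integer data).
; The @llvm.x86.avx2.vperm2i128 declaration below is not called by this test.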
define <4 x i64> @test_mm256_permute2x128_si256(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permute2x128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly

define <4 x i64> @test_mm256_permute4x64_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_permute4x64_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,2,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
  ret <4 x i64> %res
}

define <4 x double> @test_mm256_permute4x64_pd(<4 x double> %a0) {
; CHECK-LABEL: test_mm256_permute4x64_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,2,1,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
  ret <4 x double> %res
}

define <4 x i64> @test_mm256_permutevar8x32_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permutevar8x32_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly

define <8 x float> @test_mm256_permutevar8x32_ps(<8 x float> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permutevar8x32_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %arg1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly

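; vpsadbw sums absolute byte differences in groups of eight into each 64-bit
; lane, so the intrinsic already returns <4 x i64> and needs no bitcast.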
define <4 x i64> @test_mm256_sad_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sad_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %arg0, <32 x i8> %arg1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_shuffle_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shuffle_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_shuffle_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_shuffle_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_shufflehi_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shufflehi_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,5,8,9,10,11,15,14,14,13]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_shufflelo_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shufflelo_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,0,1,1,4,5,6,7,11,8,9,9,12,13,14,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, i32 9, i32 12, i32 13, i32 14, i32 15>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_sign_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsignb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %call = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_sign_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsignw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sign_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsignd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone

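; The xmm-count shift forms (psll/psrl/psra) shift every lane by the same
; count taken from the low 64 bits of %a1; logical shifts with counts >= the
; element width yield zero, arithmetic shifts fill with the sign bit.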
define <4 x i64> @test_mm256_sll_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sll_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslld %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_sll_epi64(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllq %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_slli_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllw $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %arg0, i32 3)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_slli_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslld $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %arg0, i32 3)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) nounwind readnone

define <4 x i64> @test_mm256_slli_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllq $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %a0, i32 3)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) nounwind readnone

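; _mm256_slli_si256 is a byte shift within each 128-bit lane, expressed as a
; shuffle pulling zeros in from the first operand; bytes never cross the lane
; boundary, matching vpslldq.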
define <4 x i64> @test_mm256_slli_si256(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

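; The AVX2 variable shifts use an independent count per lane: vpsllv/vpsrlv
; produce zero for counts >= the element width, while vpsrav fills with the
; sign bit (as if the count were width - 1).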
define <2 x i64> @test_mm_sllv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_sllv_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_sllv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sllv_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <2 x i64> @test_mm_sllv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_sllv_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_sllv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sllv_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone

define <4 x i64> @test_mm256_sra_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sra_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sra_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sra_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrad %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srai_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srai_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsraw $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %arg0, i32 3)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_srai_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srai_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrad $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %arg0, i32 3)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) nounwind readnone

define <2 x i64> @test_mm_srav_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_srav_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srav_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_srav_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srl_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_srl_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_srl_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_srl_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srl_epi64(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_srl_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_srli_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlw $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %arg0, i32 3)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_srli_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrld $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %arg0, i32 3)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) nounwind readnone

define <4 x i64> @test_mm256_srli_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlq $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %a0, i32 3)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) nounwind readnone

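; Same per-lane trick as slli_si256, but shifting right: zeros (indices 32+
; from the second operand) are pulled in at the top of each 128-bit lane.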
define <4 x i64> @test_mm256_srli_si256(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_srlv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_srlv_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srlv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_srlv_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <2 x i64> @test_mm_srlv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_srlv_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_srlv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_srlv_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone

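; _mm256_stream_load_si256 maps to vmovntdqa, a non-temporal load that hints
; the data should bypass the caches; the address must be 32-byte aligned.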
define <4 x i64> @test_mm256_stream_load_si256(<4 x i64> *%a0) {
; X86-LABEL: test_mm256_stream_load_si256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovntdqa (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_stream_load_si256:
; X64:       # %bb.0:
; X64-NEXT:    vmovntdqa (%rdi), %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> *%a0 to i8*
  %res = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %arg0)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly

define <4 x i64> @test_mm256_sub_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = sub <32 x i8> %arg0, %arg1
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_sub_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = sub <16 x i16> %arg0, %arg1
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_sub_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = sub <8 x i32> %arg0, %arg1
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_sub_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = sub <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_subs_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_subs_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>)

define <4 x i64> @test_mm256_subs_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.usub.sat.v16i16(<16 x i16>, <16 x i16>)

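; The unpack tests interleave within each 128-bit lane, not across the full
; vector: for bytes, unpackhi takes elements 8-15 and 24-31 of each source,
; unpacklo elements 0-7 and 16-23.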
2552define <4 x i64> @test_mm256_unpackhi_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2553; CHECK-LABEL: test_mm256_unpackhi_epi8:
2554; CHECK:       # %bb.0:
2555; CHECK-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
2556; CHECK-NEXT:    ret{{[l|q]}}
2557  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2558  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2559  %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
2560  %bc = bitcast <32 x i8> %res to <4 x i64>
2561  ret <4 x i64> %bc
2562}
2563
2564define <4 x i64> @test_mm256_unpackhi_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2565; CHECK-LABEL: test_mm256_unpackhi_epi16:
2566; CHECK:       # %bb.0:
2567; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
2568; CHECK-NEXT:    ret{{[l|q]}}
2569  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2570  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2571  %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
2572  %bc = bitcast <16 x i16> %res to <4 x i64>
2573  ret <4 x i64> %bc
2574}
2575
2576define <4 x i64> @test_mm256_unpackhi_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2577; CHECK-LABEL: test_mm256_unpackhi_epi32:
2578; CHECK:       # %bb.0:
2579; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
2580; CHECK-NEXT:    ret{{[l|q]}}
2581  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2582  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2583  %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
2584  %bc = bitcast <8 x i32> %res to <4 x i64>
2585  ret <4 x i64> %bc
2586}
2587
define <4 x i64> @test_mm256_unpackhi_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  ret <4 x i64> %res
}

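; _mm256_unpacklo_epi8: interleave the low 8 bytes within each 128-bit lane; the CHECK lines expect vpunpcklbw.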
define <4 x i64> @test_mm256_unpacklo_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

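; _mm256_unpacklo_epi16: interleave the low 4 words within each 128-bit lane; the CHECK lines expect vpunpcklwd.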
define <4 x i64> @test_mm256_unpacklo_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

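; _mm256_unpacklo_epi32: interleave the low 2 dwords within each 128-bit lane; the CHECK lines expect
; the FP-domain vunpcklps.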
define <4 x i64> @test_mm256_unpacklo_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

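; _mm256_unpacklo_epi64: interleave the low qword of each 128-bit lane; the CHECK lines expect the
; FP-domain vunpcklpd.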
define <4 x i64> @test_mm256_unpacklo_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ret <4 x i64> %res
}

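; _mm256_xor_si256: plain vector xor; the CHECK lines expect the FP-domain vxorps rather than vpxor.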
define <4 x i64> @test_mm256_xor_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_xor_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = xor <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
