; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx2-builtins.c

define <4 x i64> @test_mm256_abs_epi8(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsb %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <32 x i8>
  %sub = sub <32 x i8> zeroinitializer, %arg
  %cmp = icmp sgt <32 x i8> %arg, zeroinitializer
  %sel = select <32 x i1> %cmp, <32 x i8> %arg, <32 x i8> %sub
  %res = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_abs_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsw %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <16 x i16>
  %sub = sub <16 x i16> zeroinitializer, %arg
  %cmp = icmp sgt <16 x i16> %arg, zeroinitializer
  %sel = select <16 x i1> %cmp, <16 x i16> %arg, <16 x i16> %sub
  %res = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_abs_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsd %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <8 x i32>
  %sub = sub <8 x i32> zeroinitializer, %arg
  %cmp = icmp sgt <8 x i32> %arg, zeroinitializer
  %sel = select <8 x i1> %cmp, <8 x i32> %arg, <8 x i32> %sub
  %res = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32>) nounwind readnone

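;
; For reference, the clang-side tests take roughly this shape (a hedged
; sketch, not copied verbatim from avx2-builtins.c):
;   __m256i test_mm256_abs_epi8(__m256i a) {
;     return _mm256_abs_epi8(a); // clang's expansion of the builtin is the
;   }                            // IR body checked in the matching test below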
define <4 x i64> @test_mm256_add_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = add <32 x i8> %arg0, %arg1
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_add_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = add <16 x i16> %arg0, %arg1
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_add_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = add <8 x i32> %arg0, %arg1
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_add_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = add <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_adds_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8>, <32 x i8>)

define <4 x i64> @test_mm256_adds_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16>, <16 x i16>)

define <4 x i64> @test_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_alignr_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test2_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test2_mm256_alignr_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_and_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_and_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = and <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_andnot_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_andnot_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; CHECK-NEXT:    vpxor %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    vpand %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %not = xor <4 x i64> %a0, <i64 -1, i64 -1, i64 -1, i64 -1>
  %res = and <4 x i64> %not, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_avg_epu8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_avg_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_avg_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_blend_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_blend_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_blend_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_blend_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %shuf = shufflevector <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %res = bitcast <4 x i32> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_blend_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_blend_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_blendv_epi8(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
; CHECK-LABEL: test_mm256_blendv_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %arg2 = bitcast <4 x i64> %a2 to <32 x i8>
  %call = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %arg0, <32 x i8> %arg1, <32 x i8> %arg2)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone

define <2 x i64> @test_mm_broadcastb_epi8(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastb_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <16 x i32> zeroinitializer
  %res = bitcast <16 x i8> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastb_epi8(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastb_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> undef, <32 x i32> zeroinitializer
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %shuf = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = bitcast <4 x i32> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastd_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastq_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> zeroinitializer
  ret <4 x i64> %res
}

define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) {
; CHECK-LABEL: test_mm_broadcastsd_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
  ret <2 x double> %res
}

define <4 x double> @test_mm256_broadcastsd_pd(<4 x double> %a0) {
; CHECK-LABEL: test_mm256_broadcastsd_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> zeroinitializer
  ret <4 x double> %res
}

define <4 x i64> @test_mm256_broadcastsi128_si256(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastsi128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_broadcastsi128_si256_mem(<2 x i64>* %p0) {
; X86-LABEL: test_mm256_broadcastsi128_si256_mem:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcastsi128_si256_mem:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %a0 = load <2 x i64>, <2 x i64>* %p0
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %res
}

define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm_broadcastss_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %res
}

define <8 x float> @test_mm256_broadcastss_ps(<8 x float> %a0) {
; CHECK-LABEL: test_mm256_broadcastss_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> zeroinitializer
  ret <8 x float> %res
}

define <2 x i64> @test_mm_broadcastw_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastw_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> zeroinitializer
  %res = bitcast <8 x i16> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastw_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastw_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> zeroinitializer
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_bslli_epi128(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_bslli_epi128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_bsrli_epi128(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_bsrli_epi128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cmpeq_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp eq <32 x i8> %arg0, %arg1
  %res = sext <32 x i1> %cmp to <32 x i8>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpeq_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp eq <16 x i16> %arg0, %arg1
  %res = sext <16 x i1> %cmp to <16 x i16>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpeq_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp eq <8 x i32> %arg0, %arg1
  %res = sext <8 x i1> %cmp to <8 x i32>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpeq_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %cmp = icmp eq <4 x i64> %a0, %a1
  %res = sext <4 x i1> %cmp to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cmpgt_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp sgt <32 x i8> %arg0, %arg1
  %res = sext <32 x i1> %cmp to <32 x i8>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpgt_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp sgt <16 x i16> %arg0, %arg1
  %res = sext <16 x i1> %cmp to <16 x i16>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpgt_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp sgt <8 x i32> %arg0, %arg1
  %res = sext <8 x i1> %cmp to <8 x i32>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpgt_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %cmp = icmp sgt <4 x i64> %a0, %a1
  %res = sext <4 x i1> %cmp to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi8_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext = sext <16 x i8> %arg0 to <16 x i16>
  %res = bitcast <16 x i16> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi8_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %ext = sext <8 x i8> %shuf to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi8_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = sext <4 x i8> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepi16_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi16_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxwd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext = sext <8 x i16> %arg0 to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi16_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi16_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxwq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = sext <4 x i16> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepi32_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi32_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxdq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext = sext <4 x i32> %arg0 to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepu8_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext = zext <16 x i8> %arg0 to <16 x i16>
  %res = bitcast <16 x i16> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepu8_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %ext = zext <8 x i8> %shuf to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepu8_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = zext <4 x i8> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepu16_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu16_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext = zext <8 x i16> %arg0 to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepu16_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu16_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = zext <4 x i16> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepu32_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu32_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext = zext <4 x i32> %arg0 to <4 x i64>
  ret <4 x i64> %ext
}

define <2 x i64> @test_mm256_extracti128_si256(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_extracti128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_hadd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadd_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_hadd_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_hadds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadds_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_hsub_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsub_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_hsub_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsub_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_hsubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsubs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone

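; The unmasked gather tests below synthesize an all-ones mask (a pcmpeq of a
; register with itself, or a compare of zero against zero) so that every
; element is loaded with a scale of 2; the *mask* variants instead forward the
; caller-supplied passthrough and mask operands straight to the intrinsic.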
define <2 x i64> @test_mm_i32gather_epi32(i32 *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i32gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i32gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i32 *%a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, i8* %arg0, <4 x i32> %arg1, <4 x i32> %mask, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*, <4 x i32>, <4 x i32>, i8) nounwind readonly

define <2 x i64> @test_mm_mask_i32gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
; X86-LABEL: test_mm_mask_i32gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i32gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast i32 *%a1 to i8*
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %arg0, i8* %arg1, <4 x i32> %arg2, <4 x i32> %arg3, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}

define <4 x i64> @test_mm256_i32gather_epi32(i32 *%a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i32gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherdd %ymm2, (%eax,%ymm0,2), %ymm1
; X86-NEXT:    vmovdqa %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i32gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT:    vmovdqa %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast i32 *%a0 to i8*
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %mask = bitcast <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1> to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8* %arg0, <8 x i32> %arg1, <8 x i32> %mask, i8 2)
  %bc = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*, <8 x i32>, <8 x i32>, i8) nounwind readonly

define <4 x i64> @test_mm256_mask_i32gather_epi32(<4 x i64> %a0, i32 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
; X86-LABEL: test_mm256_mask_i32gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i32gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast i32 *%a1 to i8*
  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
  %arg3 = bitcast <4 x i64> %a3 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %arg0, i8* %arg1, <8 x i32> %arg2, <8 x i32> %arg3, i8 2)
  %bc = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %bc
}

define <2 x i64> @test_mm_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i32gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherdq %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i32gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i64 *%a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> undef, i8* %arg0, <4 x i32> %arg1, <2 x i64> <i64 -1, i64 -1>, i8 2)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, i8*, <4 x i32>, <2 x i64>, i8) nounwind readonly

define <2 x i64> @test_mm_mask_i32gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
; X86-LABEL: test_mm_mask_i32gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i32gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast i64 *%a1 to i8*
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <2 x i64> %a3, i8 2)
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm256_i32gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherdq %ymm2, (%eax,%xmm0,2), %ymm1
; X86-NEXT:    vmovdqa %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i32gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm0,2), %ymm1
; X64-NEXT:    vmovdqa %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast i64 *%a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8* %arg0, <4 x i32> %arg1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, i8*, <4 x i32>, <4 x i64>, i8) nounwind readonly

define <4 x i64> @test_mm256_mask_i32gather_epi64(<4 x i64> %a0, i64 *%a1, <2 x i64> %a2, <4 x i64> %a3) {
; X86-LABEL: test_mm256_mask_i32gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i32gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast i64 *%a1 to i8*
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <4 x i64> %a3, i8 2)
  ret <4 x i64> %res
}

define <2 x double> @test_mm_i32gather_pd(double *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i32gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherdpd %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovapd %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i32gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovapd %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast double *%a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
  %sext = sext <2 x i1> %cmp to <2 x i64>
  %mask = bitcast <2 x i64> %sext to <2 x double>
  %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> undef, i8* %arg0, <4 x i32> %arg1, <2 x double> %mask, i8 2)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*, <4 x i32>, <2 x double>, i8) nounwind readonly

define <2 x double> @test_mm_mask_i32gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
; X86-LABEL: test_mm_mask_i32gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i32gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast double *%a1 to i8*
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0, i8* %arg1, <4 x i32> %arg2, <2 x double> %a3, i8 2)
  ret <2 x double> %res
}

define <4 x double> @test_mm256_i32gather_pd(double *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm256_i32gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
; X86-NEXT:    vgatherdpd %ymm2, (%eax,%xmm0,2), %ymm1
; X86-NEXT:    vmovapd %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i32gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
; X64-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm0,2), %ymm1
; X64-NEXT:    vmovapd %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast double *%a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
  %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8* %arg0, <4 x i32> %arg1, <4 x double> %mask, i8 2)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*, <4 x i32>, <4 x double>, i8) nounwind readonly

define <4 x double> @test_mm256_mask_i32gather_pd(<4 x double> %a0, double *%a1, <2 x i64> %a2, <4 x double> %a3) {
; X86-LABEL: test_mm256_mask_i32gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i32gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast double *%a1 to i8*
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0, i8* %arg1, <4 x i32> %arg2, <4 x double> %a3, i8 2)
  ret <4 x double> %res
}

define <4 x float> @test_mm_i32gather_ps(float *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i32gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherdps %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovaps %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i32gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherdps %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovaps %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast float *%a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %mask = bitcast <4 x i32> %sext to <4 x float>
  %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8* %arg0, <4 x i32> %arg1, <4 x float> %mask, i8 2)
  ret <4 x float> %call
}
declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*, <4 x i32>, <4 x float>, i8) nounwind readonly

define <4 x float> @test_mm_mask_i32gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
; X86-LABEL: test_mm_mask_i32gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i32gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast float *%a1 to i8*
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0, i8* %arg1, <4 x i32> %arg2, <4 x float> %a3, i8 2)
  ret <4 x float> %call
}

define <8 x float> @test_mm256_i32gather_ps(float *%a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i32gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT:    vcmpeqps %ymm1, %ymm1, %ymm2
; X86-NEXT:    vgatherdps %ymm2, (%eax,%ymm0,2), %ymm1
; X86-NEXT:    vmovaps %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i32gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vcmpeqps %ymm1, %ymm1, %ymm2
; X64-NEXT:    vgatherdps %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT:    vmovaps %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast float *%a0 to i8*
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %mask = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i8 0)
  %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8* %arg0, <8 x i32> %arg1, <8 x float> %mask, i8 2)
  ret <8 x float> %call
}
declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*, <8 x i32>, <8 x float>, i8) nounwind readonly

define <8 x float> @test_mm256_mask_i32gather_ps(<8 x float> %a0, float *%a1, <4 x i64> %a2, <8 x float> %a3) {
; X86-LABEL: test_mm256_mask_i32gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i32gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vgatherdps %ymm2, (%rdi,%ymm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast float *%a1 to i8*
  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
  %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0, i8* %arg1, <8 x i32> %arg2, <8 x float> %a3, i8 2)
  ret <8 x float> %call
}

define <2 x i64> @test_mm_i64gather_epi32(i32 *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i64gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherqd %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i64gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i32 *%a0 to i8*
  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> undef, i8* %arg0, <2 x i64> %a1, <4 x i32> %mask, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, i8*, <2 x i64>, <4 x i32>, i8) nounwind readonly

define <2 x i64> @test_mm_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
; X86-LABEL: test_mm_mask_i64gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i64gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast i32 *%a1 to i8*
  %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %arg0, i8* %arg1, <2 x i64> %a2, <4 x i32> %arg3, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}

define <2 x i64> @test_mm256_i64gather_epi32(i32 *%a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i64gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherqd %xmm2, (%eax,%ymm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i64gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast i32 *%a0 to i8*
  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8* %arg0, <4 x i64> %a1, <4 x i32> %mask, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, i8*, <4 x i64>, <4 x i32>, i8) nounwind readonly

define <2 x i64> @test_mm256_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <4 x i64> %a2, <2 x i64> %a3) {
; X86-LABEL: test_mm256_mask_i64gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i64gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast i32 *%a1 to i8*
  %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %arg0, i8* %arg1, <4 x i64> %a2, <4 x i32> %arg3, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}

define <2 x i64> @test_mm_i64gather_epi64(i64 *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i64gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherqq %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i64gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i64 *%a0 to i8*
  %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> undef, i8* %arg0, <2 x i64> %a1, <2 x i64> <i64 -1, i64 -1>, i8 2)
  ret <2 x i64> %call
}
declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, i8*, <2 x i64>, <2 x i64>, i8) nounwind readonly

define <2 x i64> @test_mm_mask_i64gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
; X86-LABEL: test_mm_mask_i64gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i64gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast i64 *%a1 to i8*
  %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0, i8* %arg1, <2 x i64> %a2, <2 x i64> %a3, i8 2)
  ret <2 x i64> %call
}

define <4 x i64> @test_mm256_i64gather_epi64(i64 *%a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i64gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherqq %ymm2, (%eax,%ymm0,2), %ymm1
; X86-NEXT:    vmovdqa %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i64gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT:    vmovdqa %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast i64 *%a0 to i8*
  %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8* %arg0, <4 x i64> %a1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
  ret <4 x i64> %call
}
declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, i8*, <4 x i64>, <4 x i64>, i8) nounwind readonly

define <4 x i64> @test_mm256_mask_i64gather_epi64(<4 x i64> %a0, i64 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
; X86-LABEL: test_mm256_mask_i64gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i64gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast i64 *%a1 to i8*
  %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0, i8* %arg1, <4 x i64> %a2, <4 x i64> %a3, i8 2)
  ret <4 x i64> %call
}

1269define <2 x double> @test_mm_i64gather_pd(double *%a0, <2 x i64> %a1) {
1270; X86-LABEL: test_mm_i64gather_pd:
1271; X86:       # %bb.0:
1272; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1273; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1274; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1275; X86-NEXT:    vgatherqpd %xmm2, (%eax,%xmm0,2), %xmm1
1276; X86-NEXT:    vmovapd %xmm1, %xmm0
1277; X86-NEXT:    retl
1278;
1279; X64-LABEL: test_mm_i64gather_pd:
1280; X64:       # %bb.0:
1281; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1282; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1283; X64-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm0,2), %xmm1
1284; X64-NEXT:    vmovapd %xmm1, %xmm0
1285; X64-NEXT:    retq
1286  %arg0 = bitcast double *%a0 to i8*
1287  %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
1288  %sext = sext <2 x i1> %cmp to <2 x i64>
1289  %mask = bitcast <2 x i64> %sext to <2 x double>
1290  %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> undef, i8* %arg0, <2 x i64> %a1, <2 x double> %mask, i8 2)
1291  ret <2 x double> %call
1292}
1293declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, i8*, <2 x i64>, <2 x double>, i8) nounwind readonly
1294
1295define <2 x double> @test_mm_mask_i64gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
1296; X86-LABEL: test_mm_mask_i64gather_pd:
1297; X86:       # %bb.0:
1298; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1299; X86-NEXT:    vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0
1300; X86-NEXT:    retl
1301;
1302; X64-LABEL: test_mm_mask_i64gather_pd:
1303; X64:       # %bb.0:
1304; X64-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0
1305; X64-NEXT:    retq
1306  %arg1 = bitcast double *%a1 to i8*
1307  %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0, i8* %arg1, <2 x i64> %a2, <2 x double> %a3, i8 2)
1308  ret <2 x double> %call
1309}
1310
1311define <4 x double> @test_mm256_i64gather_pd(double *%a0, <4 x i64> %a1) {
1312; X86-LABEL: test_mm256_i64gather_pd:
1313; X86:       # %bb.0:
1314; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1315; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1316; X86-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
1317; X86-NEXT:    vgatherqpd %ymm2, (%eax,%ymm0,2), %ymm1
1318; X86-NEXT:    vmovapd %ymm1, %ymm0
1319; X86-NEXT:    retl
1320;
1321; X64-LABEL: test_mm256_i64gather_pd:
1322; X64:       # %bb.0:
1323; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1324; X64-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
1325; X64-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm0,2), %ymm1
1326; X64-NEXT:    vmovapd %ymm1, %ymm0
1327; X64-NEXT:    retq
1328  %arg0 = bitcast double *%a0 to i8*
1329  %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
1330  %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8* %arg0, <4 x i64> %a1, <4 x double> %mask, i8 2)
1331  ret <4 x double> %call
1332}
1333declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, i8*, <4 x i64>, <4 x double>, i8) nounwind readonly
1334
define <4 x double> @test_mm256_mask_i64gather_pd(<4 x double> %a0, double *%a1, <4 x i64> %a2, <4 x double> %a3) {
; X86-LABEL: test_mm256_mask_i64gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i64gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast double *%a1 to i8*
  %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0, i8* %arg1, <4 x i64> %a2, <4 x double> %a3, i8 2)
  ret <4 x double> %call
}

define <4 x float> @test_mm_i64gather_ps(float *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i64gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherqps %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovaps %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i64gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovaps %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast float *%a0 to i8*
  %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %mask = bitcast <4 x i32> %sext to <4 x float>
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> undef, i8* %arg0, <2 x i64> %a1, <4 x float> %mask, i8 2)
  ret <4 x float> %call
}
declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*, <2 x i64>, <4 x float>, i8) nounwind readonly

define <4 x float> @test_mm_mask_i64gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
; X86-LABEL: test_mm_mask_i64gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i64gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast float *%a1 to i8*
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0, i8* %arg1, <2 x i64> %a2, <4 x float> %a3, i8 2)
  ret <4 x float> %call
}

define <4 x float> @test_mm256_i64gather_ps(float *%a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i64gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherqps %xmm2, (%eax,%ymm0,2), %xmm1
; X86-NEXT:    vmovaps %xmm1, %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i64gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%ymm0,2), %xmm1
; X64-NEXT:    vmovaps %xmm1, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast float *%a0 to i8*
  %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %mask = bitcast <4 x i32> %sext to <4 x float>
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8* %arg0, <4 x i64> %a1, <4 x float> %mask, i8 2)
  ret <4 x float> %call
}
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, i8*, <4 x i64>, <4 x float>, i8) nounwind readonly

define <4 x float> @test_mm256_mask_i64gather_ps(<4 x float> %a0, float *%a1, <4 x i64> %a2, <4 x float> %a3) {
; X86-LABEL: test_mm256_mask_i64gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i64gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%ymm1,2), %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg1 = bitcast float *%a1 to i8*
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0, i8* %arg1, <4 x i64> %a2, <4 x float> %a3, i8 2)
  ret <4 x float> %call
}

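; The inserti128 tests expand the intrinsic to a pair of shufflevectors: an
; insert into the low lane is expected to select to a blend (vblendps), while
; an insert into the high lane selects to vinsertf128.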
define <4 x i64> @test0_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test0_mm256_inserti128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x i64> %res
}

define <4 x i64> @test1_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test1_mm256_inserti128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_madd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_madd_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_maddubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_maddubs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone

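; The maskload/maskstore tests below call the AVX2 intrinsics directly, with
; the element pointer bitcast to the i8* the intrinsics expect.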
define <2 x i64> @test_mm_maskload_epi32(i32* %a0, <2 x i64> %a1) nounwind {
; X86-LABEL: test_mm_maskload_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovd (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskload_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i32* %a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.maskload.d(i8*, <4 x i32>) nounwind readonly

define <4 x i64> @test_mm256_maskload_epi32(i32* %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_maskload_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovd (%eax), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskload_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast i32* %a0 to i8*
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.maskload.d.256(i8*, <8 x i32>) nounwind readonly

define <2 x i64> @test_mm_maskload_epi64(i64* %a0, <2 x i64> %a1) nounwind {
; X86-LABEL: test_mm_maskload_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovq (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskload_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to i8*
  %res = call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %arg0, <2 x i64> %a1)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.maskload.q(i8*, <2 x i64>) nounwind readonly

define <4 x i64> @test_mm256_maskload_epi64(i64* %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_maskload_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovq (%eax), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskload_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to i8*
  %res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %arg0, <4 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.maskload.q.256(i8*, <4 x i64>) nounwind readonly

define void @test_mm_maskstore_epi32(i32* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X86-LABEL: test_mm_maskstore_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovd %xmm1, %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskstore_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast i32* %a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  call void @llvm.x86.avx2.maskstore.d(i8* %arg0, <4 x i32> %arg1, <4 x i32> %arg2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.d(i8*, <4 x i32>, <4 x i32>) nounwind

define void @test_mm256_maskstore_epi32(i32* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
; X86-LABEL: test_mm256_maskstore_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovd %ymm1, %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskstore_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast i32* %a0 to i8*
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
  call void @llvm.x86.avx2.maskstore.d.256(i8* %arg0, <8 x i32> %arg1, <8 x i32> %arg2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>) nounwind

define void @test_mm_maskstore_epi64(i64* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X86-LABEL: test_mm_maskstore_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovq %xmm1, %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskstore_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovq %xmm1, %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to i8*
  call void @llvm.x86.avx2.maskstore.q(i8* %arg0, <2 x i64> %a1, <2 x i64> %a2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.q(i8*, <2 x i64>, <2 x i64>) nounwind

define void @test_mm256_maskstore_epi64(i64* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
; X86-LABEL: test_mm256_maskstore_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovq %ymm1, %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskstore_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovq %ymm1, %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to i8*
  call void @llvm.x86.avx2.maskstore.q.256(i8* %arg0, <4 x i64> %a1, <4 x i64> %a2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>) nounwind

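; The integer min/max tests are written as generic icmp+select IR, which
; instruction selection is expected to fold back into the single
; vpmax*/vpmin* instruction shown in the assertions.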
define <4 x i64> @test_mm256_max_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp sgt <32 x i8> %arg0, %arg1
  %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp sgt <16 x i16> %arg0, %arg1
  %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp sgt <8 x i32> %arg0, %arg1
  %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp ugt <32 x i8> %arg0, %arg1
  %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp ugt <16 x i16> %arg0, %arg1
  %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epu32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp ugt <8 x i32> %arg0, %arg1
  %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp slt <32 x i8> %arg0, %arg1
  %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp slt <16 x i16> %arg0, %arg1
  %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp slt <8 x i32> %arg0, %arg1
  %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminub %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp ult <32 x i8> %arg0, %arg1
  %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp ult <16 x i16> %arg0, %arg1
  %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epu32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminud %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp ult <8 x i32> %arg0, %arg1
  %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define i32 @test_mm256_movemask_epi8(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_movemask_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovmskb %ymm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %arg0)
  ret i32 %res
}
declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_mpsadbw_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mpsadbw_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmpsadbw $3, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %call = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %arg0, <32 x i8> %arg1, i8 3)
  %bc = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone

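; mul_epi32/mul_epu32 are expressed as a sign extension of the low 32 bits of
; each element (shl+ashr by 32) or a zero extension (and with 0xffffffff),
; followed by a 64-bit multiply; these patterns should select to
; vpmuldq/vpmuludq respectively.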
define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mul_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %A = shl <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
  %A1 = ashr exact <4 x i64> %A, <i64 32, i64 32, i64 32, i64 32>
  %B = shl <4 x i64> %a1, <i64 32, i64 32, i64 32, i64 32>
  %B1 = ashr exact <4 x i64> %B, <i64 32, i64 32, i64 32, i64 32>
  %res = mul nsw <4 x i64> %A1, %B1
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mul_epu32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %A = and <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %B = and <4 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %res = mul nuw <4 x i64> %A, %B
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_mulhi_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhi_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mulhi_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhi_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mulhrs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhrs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mullo_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mullo_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = mul <16 x i16> %arg0, %arg1
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_mullo_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mullo_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = mul <8 x i32> %arg0, %arg1
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_or_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_or_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = or <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_packs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_packs_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packs_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_packus_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packus_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_packus_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packus_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone

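; The permute tests use shufflevector with constant masks; the backend is free
; to select the equivalent floating-point domain forms (vperm2f128, vpermpd,
; vpermps), as the assertions show.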
define <4 x i64> @test_mm256_permute2x128_si256(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permute2x128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly

define <4 x i64> @test_mm256_permute4x64_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_permute4x64_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,2,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
  ret <4 x i64> %res
}

define <4 x double> @test_mm256_permute4x64_pd(<4 x double> %a0) {
; CHECK-LABEL: test_mm256_permute4x64_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,2,1,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
  ret <4 x double> %res
}

define <4 x i64> @test_mm256_permutevar8x32_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permutevar8x32_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly

define <8 x float> @test_mm256_permutevar8x32_ps(<8 x float> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permutevar8x32_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %arg1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly

define <4 x i64> @test_mm256_sad_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sad_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %arg0, <32 x i8> %arg1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_shuffle_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shuffle_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_shuffle_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_shuffle_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone

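; shufflehi/shufflelo encode the 8-bit immediate as a per-128-bit-lane
; shufflevector mask, selecting to vpshufhw/vpshuflw.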
define <4 x i64> @test_mm256_shufflehi_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shufflehi_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,5,8,9,10,11,15,14,14,13]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_shufflelo_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shufflelo_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,0,1,1,4,5,6,7,11,8,9,9,12,13,14,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, i32 9, i32 12, i32 13, i32 14, i32 15>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_sign_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsignb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %call = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_sign_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsignw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sign_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsignd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone

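; Shift tests: sll/sra/srl take the shift amount from the low quadword of an
; xmm operand, slli/srai/srli take an immediate, and sllv/srav/srlv shift each
; element by a per-element variable amount.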
define <4 x i64> @test_mm256_sll_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sll_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslld %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_sll_epi64(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllq %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_slli_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllw $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %arg0, i32 3)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_slli_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslld $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %arg0, i32 3)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) nounwind readnone

define <4 x i64> @test_mm256_slli_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllq $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %a0, i32 3)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) nounwind readnone

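; slli_si256/srli_si256 model the byte shift as a shufflevector with a zero
; vector, applied independently to each 128-bit lane.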
define <4 x i64> @test_mm256_slli_si256(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_sllv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_sllv_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_sllv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sllv_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <2 x i64> @test_mm_sllv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_sllv_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_sllv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sllv_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone

define <4 x i64> @test_mm256_sra_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sra_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sra_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sra_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrad %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srai_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srai_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsraw $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %arg0, i32 3)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_srai_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srai_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrad $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %arg0, i32 3)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) nounwind readnone

define <2 x i64> @test_mm_srav_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_srav_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srav_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_srav_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srl_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_srl_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_srl_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_srl_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srl_epi64(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_srl_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_srli_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlw $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %arg0, i32 3)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_srli_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrld $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %arg0, i32 3)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) nounwind readnone

define <4 x i64> @test_mm256_srli_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlq $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %a0, i32 3)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) nounwind readnone

define <4 x i64> @test_mm256_srli_si256(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_srlv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_srlv_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srlv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_srlv_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <2 x i64> @test_mm_srlv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_srlv_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_srlv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_srlv_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone

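; stream_load maps directly to the movntdqa intrinsic, a non-temporal aligned
; load.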
define <4 x i64> @test_mm256_stream_load_si256(<4 x i64> *%a0) {
; X86-LABEL: test_mm256_stream_load_si256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovntdqa (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_stream_load_si256:
; X64:       # %bb.0:
; X64-NEXT:    vmovntdqa (%rdi), %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> *%a0 to i8*
  %res = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %arg0)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly

define <4 x i64> @test_mm256_sub_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = sub <32 x i8> %arg0, %arg1
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_sub_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = sub <16 x i16> %arg0, %arg1
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_sub_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = sub <8 x i32> %arg0, %arg1
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_sub_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = sub <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

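; The saturating subtract tests use the generic llvm.ssub.sat/llvm.usub.sat
; intrinsics rather than target-specific ones.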
define <4 x i64> @test_mm256_subs_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_subs_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_subs_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.usub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone

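; The unpack tests interleave elements via shufflevector; for 32/64-bit
; elements the backend may select the FP-domain vunpck*ps/vunpck*pd forms, as
; the assertions show.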
2558define <4 x i64> @test_mm256_unpackhi_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2559; CHECK-LABEL: test_mm256_unpackhi_epi8:
2560; CHECK:       # %bb.0:
2561; CHECK-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
2562; CHECK-NEXT:    ret{{[l|q]}}
2563  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2564  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2565  %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
2566  %bc = bitcast <32 x i8> %res to <4 x i64>
2567  ret <4 x i64> %bc
2568}

define <4 x i64> @test_mm256_unpackhi_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpackhi_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpackhi_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  ret <4 x i64> %res
}
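
; NOTE: The same per-lane pattern holds as the element width grows: epi16
; interleaves words 4-7 and 12-15, epi32 interleaves dwords 2-3 and 6-7, and
; epi64 interleaves the single high qword of each lane (mask <1, 5, 3, 7>).
; The dword/qword forms are checked as vunpckhps/vunpckhpd, whose data
; movement is bit-identical to vpunpckhdq/vpunpckhqdq; only the execution
; domain differs.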

define <4 x i64> @test_mm256_unpacklo_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpacklo_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpacklo_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpacklo_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ret <4 x i64> %res
}
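
; NOTE: The unpacklo tests mirror the unpackhi tests using the low half of
; each 128-bit lane; for epi64 the mask <0, 4, 2, 6> selects the low qword of
; each lane from each source, lowered here as vunpcklpd.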

define <4 x i64> @test_mm256_xor_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_xor_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = xor <4 x i64> %a0, %a1
  ret <4 x i64> %res
}
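
; NOTE: _mm256_xor_si256 needs no target intrinsic; a plain IR xor suffices.
; vxorps performs the same bitwise XOR as vpxor (only the execution domain
; differs), so the FP-domain form checked above is a correct lowering.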

declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
