; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx2-builtins.c
; _mm256_abs_epi8 - llvm.abs on <32 x i8> is expected to select a single vpabsb.
define <4 x i64> @test_mm256_abs_epi8(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsb %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <32 x i8>
  %abs = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %arg, i1 false)
  %res = bitcast <32 x i8> %abs to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.abs.v32i8(<32 x i8>, i1) nounwind readnone
; _mm256_abs_epi16 - llvm.abs on <16 x i16> is expected to select a single vpabsw.
define <4 x i64> @test_mm256_abs_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsw %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <16 x i16>
  %abs = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %arg, i1 false)
  %res = bitcast <16 x i16> %abs to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1) nounwind readnone
; _mm256_abs_epi32 - llvm.abs on <8 x i32> is expected to select a single vpabsd.
define <4 x i64> @test_mm256_abs_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsd %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <8 x i32>
  %abs = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %arg, i1 false)
  %res = bitcast <8 x i32> %abs to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1) nounwind readnone
; _mm256_add_epi8 - a plain add on <32 x i8> is expected to select vpaddb.
define <4 x i64> @test_mm256_add_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = add <32 x i8> %arg0, %arg1
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
; _mm256_add_epi16 - a plain add on <16 x i16> is expected to select vpaddw.
define <4 x i64> @test_mm256_add_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = add <16 x i16> %arg0, %arg1
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
; _mm256_add_epi32 - a plain add on <8 x i32> is expected to select vpaddd.
define <4 x i64> @test_mm256_add_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = add <8 x i32> %arg0, %arg1
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
; _mm256_add_epi64 - a plain add on <4 x i64> (no bitcasts needed) is expected to select vpaddq.
define <4 x i64> @test_mm256_add_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = add <4 x i64> %a0, %a1
  ret <4 x i64> %res
}
; _mm256_adds_epi8 - llvm.sadd.sat on <32 x i8> is expected to select vpaddsb.
define <4 x i64> @test_mm256_adds_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
; _mm256_adds_epi16 - llvm.sadd.sat on <16 x i16> is expected to select vpaddsw.
define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
; _mm256_adds_epu8 - llvm.uadd.sat on <32 x i8> is expected to select vpaddusb.
define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8>, <32 x i8>)
; _mm256_adds_epu16 - llvm.uadd.sat on <16 x i16> is expected to select vpaddusw.
define <4 x i64> @test_mm256_adds_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16>, <16 x i16>)
; _mm256_alignr_epi8 - a byte shuffle that, per 128-bit lane, takes bytes 2..15 of
; %a0 followed by bytes 0..1 of %a1 is expected to select vpalignr.
define <4 x i64> @test_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_alignr_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}
; Second _mm256_alignr_epi8 case - same shuffle shape with a one-byte offset
; (bytes 1..15 of %a0 followed by byte 0 of %a1 in each 128-bit lane).
define <4 x i64> @test2_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test2_mm256_alignr_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}
; _mm256_and_si256 - a bitwise and on <4 x i64>; the expected output uses vandps.
define <4 x i64> @test_mm256_and_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_and_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = and <4 x i64> %a0, %a1
  ret <4 x i64> %res
}
; _mm256_andnot_si256 - expressed as xor-with-all-ones followed by and. The
; expected output materialises all-ones with vpcmpeqd, then vpxor and vpand.
define <4 x i64> @test_mm256_andnot_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_andnot_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; CHECK-NEXT:    vpxor %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    vpand %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %not = xor <4 x i64> %a0, <i64 -1, i64 -1, i64 -1, i64 -1>
  %res = and <4 x i64> %not, %a1
  ret <4 x i64> %res
}
; _mm256_avg_epu8 - the llvm.x86.avx2.pavg.b intrinsic is expected to select vpavgb.
define <4 x i64> @test_mm256_avg_epu8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_avg_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
; _mm256_avg_epu16 - the llvm.x86.avx2.pavg.w intrinsic is expected to select vpavgw.
define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_avg_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone
; _mm256_blend_epi16 - a shuffle that takes elements 1 and 9 from %a1 and the
; rest from %a0 is expected to select vpblendw.
define <4 x i64> @test_mm256_blend_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_blend_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}
; _mm_blend_epi32 - a shuffle that takes elements 0 and 2 from %a1 and 1 and 3
; from %a0; the expected output uses vblendps.
define <2 x i64> @test_mm_blend_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_blend_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %shuf = shufflevector <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %res = bitcast <4 x i32> %shuf to <2 x i64>
  ret <2 x i64> %res
}
; _mm256_blend_epi32 - a shuffle that takes elements 0, 2, 4 and 5 from %a1 and
; the rest from %a0; the expected output uses vblendps.
define <4 x i64> @test_mm256_blend_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_blend_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}
; _mm256_blendv_epi8 - the llvm.x86.avx2.pblendvb intrinsic is expected to
; select vpblendvb with %a2 as the selector operand.
define <4 x i64> @test_mm256_blendv_epi8(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
; CHECK-LABEL: test_mm256_blendv_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %arg2 = bitcast <4 x i64> %a2 to <32 x i8>
  %call = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %arg0, <32 x i8> %arg1, <32 x i8> %arg2)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone
; _mm_broadcastb_epi8 - a splat of byte 0 across <16 x i8> is expected to select vpbroadcastb.
define <2 x i64> @test_mm_broadcastb_epi8(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastb_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <16 x i32> zeroinitializer
  %res = bitcast <16 x i8> %shuf to <2 x i64>
  ret <2 x i64> %res
}
; _mm256_broadcastb_epi8 - a splat of byte 0 across <32 x i8> is expected to
; select the xmm-to-ymm form of vpbroadcastb.
define <4 x i64> @test_mm256_broadcastb_epi8(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastb_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> undef, <32 x i32> zeroinitializer
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}
; _mm_broadcastd_epi32 - a splat of element 0 across <4 x i32>; the expected
; output uses vbroadcastss.
define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %shuf = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = bitcast <4 x i32> %shuf to <2 x i64>
  ret <2 x i64> %res
}
; _mm256_broadcastd_epi32 - a splat of element 0 across <8 x i32>; the expected
; output uses the xmm-to-ymm form of vbroadcastss.
define <4 x i64> @test_mm256_broadcastd_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}
; _mm_broadcastq_epi64 - a splat of element 0 across <2 x i64>; the expected
; output uses vmovddup.
define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
  ret <2 x i64> %res
}
; _mm256_broadcastq_epi64 - a splat of element 0 across <4 x i64>; the expected
; output uses vbroadcastsd.
define <4 x i64> @test_mm256_broadcastq_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> zeroinitializer
  ret <4 x i64> %res
}
; _mm_broadcastsd_pd - a splat of element 0 across <2 x double> is expected to select vmovddup.
define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) {
; CHECK-LABEL: test_mm_broadcastsd_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
  ret <2 x double> %res
}
; _mm256_broadcastsd_pd - a splat of element 0 across <4 x double> is expected to select vbroadcastsd.
define <4 x double> @test_mm256_broadcastsd_pd(<4 x double> %a0) {
; CHECK-LABEL: test_mm256_broadcastsd_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> zeroinitializer
  ret <4 x double> %res
}
; _mm256_broadcastsi128_si256 (register operand) - duplicating a <2 x i64> into
; both halves of a <4 x i64>; the expected output inserts xmm0 into the upper
; half with vinsertf128.
define <4 x i64> @test_mm256_broadcastsi128_si256(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastsi128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %res
}
; _mm256_broadcastsi128_si256 (memory operand) - the load feeding the duplicate
; shuffle is expected to fold into vbroadcastf128. The two targets differ only
; in how the pointer argument is obtained, hence the separate check groups.
define <4 x i64> @test_mm256_broadcastsi128_si256_mem(<2 x i64>* %p0) {
; X86-LABEL: test_mm256_broadcastsi128_si256_mem:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcastsi128_si256_mem:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %a0 = load <2 x i64>, <2 x i64>* %p0
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %res
}
; _mm_broadcastss_ps - a splat of element 0 across <4 x float> is expected to select vbroadcastss.
define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm_broadcastss_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %res
}
; _mm256_broadcastss_ps - a splat of element 0 across <8 x float> is expected to
; select the xmm-to-ymm form of vbroadcastss.
define <8 x float> @test_mm256_broadcastss_ps(<8 x float> %a0) {
; CHECK-LABEL: test_mm256_broadcastss_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> zeroinitializer
  ret <8 x float> %res
}
; _mm_broadcastw_epi16 - a splat of element 0 across <8 x i16> is expected to select vpbroadcastw.
define <2 x i64> @test_mm_broadcastw_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastw_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> zeroinitializer
  %res = bitcast <8 x i16> %shuf to <2 x i64>
  ret <2 x i64> %res
}
; _mm256_broadcastw_epi16 - a splat of element 0 across <16 x i16> is expected
; to select the xmm-to-ymm form of vpbroadcastw.
define <4 x i64> @test_mm256_broadcastw_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastw_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> zeroinitializer
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}
; _mm256_bslli_epi128 - a shuffle against a zero vector that shifts each 128-bit
; lane left by three bytes (three zero bytes, then bytes 0..12 of the lane) is
; expected to select vpslldq.
define <4 x i64> @test_mm256_bslli_epi128(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_bslli_epi128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}
; _mm256_bsrli_epi128 - a shuffle against a zero vector that shifts each 128-bit
; lane right by three bytes (bytes 3..15 of the lane, then three zero bytes) is
; expected to select vpsrldq.
define <4 x i64> @test_mm256_bsrli_epi128(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_bsrli_epi128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}
; _mm256_cmpeq_epi8 - icmp eq on <32 x i8> sign-extended back to i8 lanes is
; expected to select vpcmpeqb.
define <4 x i64> @test_mm256_cmpeq_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp eq <32 x i8> %arg0, %arg1
  %res = sext <32 x i1> %cmp to <32 x i8>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
; _mm256_cmpeq_epi16 - icmp eq on <16 x i16> sign-extended back to i16 lanes is
; expected to select vpcmpeqw.
define <4 x i64> @test_mm256_cmpeq_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp eq <16 x i16> %arg0, %arg1
  %res = sext <16 x i1> %cmp to <16 x i16>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
; _mm256_cmpeq_epi32 - icmp eq on <8 x i32> sign-extended back to i32 lanes is
; expected to select vpcmpeqd.
define <4 x i64> @test_mm256_cmpeq_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp eq <8 x i32> %arg0, %arg1
  %res = sext <8 x i1> %cmp to <8 x i32>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
; _mm256_cmpeq_epi64 - icmp eq on <4 x i64> sign-extended back to i64 lanes is
; expected to select vpcmpeqq.
define <4 x i64> @test_mm256_cmpeq_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %cmp = icmp eq <4 x i64> %a0, %a1
  %res = sext <4 x i1> %cmp to <4 x i64>
  ret <4 x i64> %res
}
; _mm256_cmpgt_epi8 - signed icmp sgt on <32 x i8> sign-extended back to i8
; lanes is expected to select vpcmpgtb.
define <4 x i64> @test_mm256_cmpgt_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp sgt <32 x i8> %arg0, %arg1
  %res = sext <32 x i1> %cmp to <32 x i8>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
; _mm256_cmpgt_epi16 - signed icmp sgt on <16 x i16> sign-extended back to i16
; lanes is expected to select vpcmpgtw.
define <4 x i64> @test_mm256_cmpgt_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp sgt <16 x i16> %arg0, %arg1
  %res = sext <16 x i1> %cmp to <16 x i16>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
; _mm256_cmpgt_epi32 - signed icmp sgt on <8 x i32> sign-extended back to i32
; lanes is expected to select vpcmpgtd.
define <4 x i64> @test_mm256_cmpgt_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp sgt <8 x i32> %arg0, %arg1
  %res = sext <8 x i1> %cmp to <8 x i32>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
; _mm256_cmpgt_epi64 - signed icmp sgt on <4 x i64> sign-extended back to i64
; lanes is expected to select vpcmpgtq.
define <4 x i64> @test_mm256_cmpgt_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %cmp = icmp sgt <4 x i64> %a0, %a1
  %res = sext <4 x i1> %cmp to <4 x i64>
  ret <4 x i64> %res
}
; _mm256_cvtepi8_epi16 - sext of all sixteen i8 lanes to i16 is expected to select vpmovsxbw.
define <4 x i64> @test_mm256_cvtepi8_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext = sext <16 x i8> %arg0 to <16 x i16>
  %res = bitcast <16 x i16> %ext to <4 x i64>
  ret <4 x i64> %res
}
; _mm256_cvtepi8_epi32 - sext of the low eight i8 lanes (extracted by shuffle)
; to i32 is expected to select vpmovsxbd.
define <4 x i64> @test_mm256_cvtepi8_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %ext = sext <8 x i8> %shuf to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}
; _mm256_cvtepi8_epi64 - sext of the low four i8 lanes (extracted by shuffle)
; to i64 is expected to select vpmovsxbq.
define <4 x i64> @test_mm256_cvtepi8_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = sext <4 x i8> %shuf to <4 x i64>
  ret <4 x i64> %ext
}
; _mm256_cvtepi16_epi32 - sext of all eight i16 lanes to i32 is expected to select vpmovsxwd.
define <4 x i64> @test_mm256_cvtepi16_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi16_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxwd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext = sext <8 x i16> %arg0 to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}
; _mm256_cvtepi16_epi64 - sext of the low four i16 lanes (extracted by shuffle)
; to i64 is expected to select vpmovsxwq.
define <4 x i64> @test_mm256_cvtepi16_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi16_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxwq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = sext <4 x i16> %shuf to <4 x i64>
  ret <4 x i64> %ext
}
; _mm256_cvtepi32_epi64 - sext of all four i32 lanes to i64 is expected to select vpmovsxdq.
define <4 x i64> @test_mm256_cvtepi32_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi32_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxdq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext = sext <4 x i32> %arg0 to <4 x i64>
  ret <4 x i64> %ext
}
; _mm256_cvtepu8_epi16 - zext of all sixteen i8 lanes to i16 is expected to select vpmovzxbw.
define <4 x i64> @test_mm256_cvtepu8_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext = zext <16 x i8> %arg0 to <16 x i16>
  %res = bitcast <16 x i16> %ext to <4 x i64>
  ret <4 x i64> %res
}
; _mm256_cvtepu8_epi32 - zext of the low eight i8 lanes (extracted by shuffle)
; to i32 is expected to select vpmovzxbd.
define <4 x i64> @test_mm256_cvtepu8_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %ext = zext <8 x i8> %shuf to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}
; _mm256_cvtepu8_epi64 - zext of the low four i8 lanes (extracted by shuffle)
; to i64 is expected to select vpmovzxbq.
define <4 x i64> @test_mm256_cvtepu8_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = zext <4 x i8> %shuf to <4 x i64>
  ret <4 x i64> %ext
}
; _mm256_cvtepu16_epi32 - zext of all eight i16 lanes to i32 is expected to select vpmovzxwd.
define <4 x i64> @test_mm256_cvtepu16_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu16_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext = zext <8 x i16> %arg0 to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}
; _mm256_cvtepu16_epi64 - zext of the low four i16 lanes (extracted by shuffle)
; to i64 is expected to select vpmovzxwq.
define <4 x i64> @test_mm256_cvtepu16_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu16_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = zext <4 x i16> %shuf to <4 x i64>
  ret <4 x i64> %ext
}
; _mm256_cvtepu32_epi64 - zext of all four i32 lanes to i64 is expected to select vpmovzxdq.
define <4 x i64> @test_mm256_cvtepu32_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu32_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext = zext <4 x i32> %arg0 to <4 x i64>
  ret <4 x i64> %ext
}
; _mm256_extracti128_si256 - extracting elements 2 and 3 (the upper 128 bits);
; the expected output uses vextractf128 $1 followed by vzeroupper before the
; return.
define <2 x i64> @test_mm256_extracti128_si256(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_extracti128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3>
  ret <2 x i64> %res
}
; _mm256_hadd_epi16 - the llvm.x86.avx2.phadd.w intrinsic is expected to select vphaddw.
define <4 x i64> @test_mm256_hadd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadd_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone
; _mm256_hadd_epi32 - the llvm.x86.avx2.phadd.d intrinsic is expected to select vphaddd.
define <4 x i64> @test_mm256_hadd_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone
; _mm256_hadds_epi16 - the llvm.x86.avx2.phadd.sw intrinsic is expected to select vphaddsw.
define <4 x i64> @test_mm256_hadds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadds_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone
; _mm256_hsub_epi16 - the llvm.x86.avx2.phsub.w intrinsic is expected to select vphsubw.
define <4 x i64> @test_mm256_hsub_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsub_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone
; _mm256_hsub_epi32 - the llvm.x86.avx2.phsub.d intrinsic is expected to select vphsubd.
define <4 x i64> @test_mm256_hsub_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsub_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone
; _mm256_hsubs_epi16 - the llvm.x86.avx2.phsub.sw intrinsic is expected to select vphsubsw.
define <4 x i64> @test_mm256_hsubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsubs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone
; _mm_i32gather_epi32 - unmasked gather via llvm.x86.avx2.gather.d.d with an
; undef pass-through, an all-ones mask and scale 2. The expected output builds
; the mask with vpcmpeqd, zeroes the destination with vpxor, gathers into it and
; copies the result to xmm0. The targets differ in how the base pointer is
; obtained, hence the separate check groups.
define <2 x i64> @test_mm_i32gather_epi32(i32 *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i32gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i32gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i32 *%a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, i8* %arg0, <4 x i32> %arg1, <4 x i32> %mask, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*, <4 x i32>, <4 x i32>, i8) nounwind readonly
; Masked 128-bit gather through llvm.x86.avx2.gather.d.d: %a0 is the source vector,
; %a1 the base pointer, %a2 the <4 x i32> indices and %a3 the mask, with a final
; i8 operand of 2. Expected code is a lone vpgatherdd writing xmm0 (preceded on
; i386 by the load of the pointer argument from the stack).
773define <2 x i64> @test_mm_mask_i32gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
774; X86-LABEL: test_mm_mask_i32gather_epi32:
775; X86:       # %bb.0:
776; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
777; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0
778; X86-NEXT:    retl
780; X64-LABEL: test_mm_mask_i32gather_epi32:
781; X64:       # %bb.0:
782; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0
783; X64-NEXT:    retq
784  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
785  %arg1 = bitcast i32 *%a1 to i8*
786  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
787  %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
788  %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %arg0, i8* %arg1, <4 x i32> %arg2, <4 x i32> %arg3, i8 2)
789  %bc = bitcast <4 x i32> %call to <2 x i64>
790  ret <2 x i64> %bc
; Unmasked 256-bit gather through llvm.x86.avx2.gather.d.d.256: undef source, constant
; all-ones mask, <8 x i32> indices, final i8 operand 2. Expected code builds the
; mask with a ymm vpcmpeqd, clears the destination with vpxor, issues a ymm
; vpgatherdd, and moves the result into ymm0.
793define <4 x i64> @test_mm256_i32gather_epi32(i32 *%a0, <4 x i64> %a1) {
794; X86-LABEL: test_mm256_i32gather_epi32:
795; X86:       # %bb.0:
796; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
797; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
798; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
799; X86-NEXT:    vpgatherdd %ymm2, (%eax,%ymm0,2), %ymm1
800; X86-NEXT:    vmovdqa %ymm1, %ymm0
801; X86-NEXT:    retl
803; X64-LABEL: test_mm256_i32gather_epi32:
804; X64:       # %bb.0:
805; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
806; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
807; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm0,2), %ymm1
808; X64-NEXT:    vmovdqa %ymm1, %ymm0
809; X64-NEXT:    retq
810  %arg0 = bitcast i32 *%a0 to i8*
811  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
812  %mask = bitcast <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1> to <8 x i32>
813  %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8* %arg0, <8 x i32> %arg1, <8 x i32> %mask, i8 2)
814  %bc = bitcast <8 x i32> %call to <4 x i64>
815  ret <4 x i64> %bc
817declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*, <8 x i32>, <8 x i32>, i8) nounwind readonly
; Masked 256-bit gather through llvm.x86.avx2.gather.d.d.256: source %a0, base
; pointer %a1, <8 x i32> indices from %a2, mask from %a3, final i8 operand 2.
; Expected code is a lone ymm vpgatherdd writing ymm0.
819define <4 x i64> @test_mm256_mask_i32gather_epi32(<4 x i64> %a0, i32 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
820; X86-LABEL: test_mm256_mask_i32gather_epi32:
821; X86:       # %bb.0:
822; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
823; X86-NEXT:    vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0
824; X86-NEXT:    retl
826; X64-LABEL: test_mm256_mask_i32gather_epi32:
827; X64:       # %bb.0:
828; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0
829; X64-NEXT:    retq
830  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
831  %arg1 = bitcast i32 *%a1 to i8*
832  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
833  %arg3 = bitcast <4 x i64> %a3 to <8 x i32>
834  %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %arg0, i8* %arg1, <8 x i32> %arg2, <8 x i32> %arg3, i8 2)
835  %bc = bitcast <8 x i32> %call to <4 x i64>
836  ret <4 x i64> %bc
; Unmasked 128-bit gather of i64 elements with i32 indices via
; llvm.x86.avx2.gather.d.q: undef source, constant all-ones <2 x i64> mask, final
; i8 operand 2. Expected code: vpcmpeqd for the mask, vpxor to clear the
; destination, vpgatherdq, then a move of the result into xmm0.
839define <2 x i64> @test_mm_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
840; X86-LABEL: test_mm_i32gather_epi64:
841; X86:       # %bb.0:
842; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
843; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
844; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
845; X86-NEXT:    vpgatherdq %xmm2, (%eax,%xmm0,2), %xmm1
846; X86-NEXT:    vmovdqa %xmm1, %xmm0
847; X86-NEXT:    retl
849; X64-LABEL: test_mm_i32gather_epi64:
850; X64:       # %bb.0:
851; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
852; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
853; X64-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm0,2), %xmm1
854; X64-NEXT:    vmovdqa %xmm1, %xmm0
855; X64-NEXT:    retq
856  %arg0 = bitcast i64 *%a0 to i8*
857  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
858  %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> undef, i8* %arg0, <4 x i32> %arg1, <2 x i64> <i64 -1, i64 -1>, i8 2)
859  ret <2 x i64> %res
861declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, i8*, <4 x i32>, <2 x i64>, i8) nounwind readonly
; Masked 128-bit gather via llvm.x86.avx2.gather.d.q: source %a0 and mask %a3 are
; passed as <2 x i64> without a cast, the indices are %a2 viewed as <4 x i32>, and
; the final i8 operand is 2. Expected code is a lone vpgatherdq writing xmm0.
863define <2 x i64> @test_mm_mask_i32gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
864; X86-LABEL: test_mm_mask_i32gather_epi64:
865; X86:       # %bb.0:
866; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
867; X86-NEXT:    vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0
868; X86-NEXT:    retl
870; X64-LABEL: test_mm_mask_i32gather_epi64:
871; X64:       # %bb.0:
872; X64-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0
873; X64-NEXT:    retq
874  %arg1 = bitcast i64 *%a1 to i8*
875  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
876  %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <2 x i64> %a3, i8 2)
877  ret <2 x i64> %res
; Unmasked 256-bit gather of i64 elements with a 128-bit <4 x i32> index vector via
; llvm.x86.avx2.gather.d.q.256: undef source, constant all-ones mask, final i8
; operand 2. The expected vpgatherdq uses an xmm index register with ymm mask and
; destination registers.
880define <4 x i64> @test_mm256_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
881; X86-LABEL: test_mm256_i32gather_epi64:
882; X86:       # %bb.0:
883; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
884; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
885; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
886; X86-NEXT:    vpgatherdq %ymm2, (%eax,%xmm0,2), %ymm1
887; X86-NEXT:    vmovdqa %ymm1, %ymm0
888; X86-NEXT:    retl
890; X64-LABEL: test_mm256_i32gather_epi64:
891; X64:       # %bb.0:
892; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
893; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
894; X64-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm0,2), %ymm1
895; X64-NEXT:    vmovdqa %ymm1, %ymm0
896; X64-NEXT:    retq
897  %arg0 = bitcast i64 *%a0 to i8*
898  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
899  %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8* %arg0, <4 x i32> %arg1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
900  ret <4 x i64> %res
902declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, i8*, <4 x i32>, <4 x i64>, i8) nounwind readonly
; Masked 256-bit gather via llvm.x86.avx2.gather.d.q.256: <4 x i64> source %a0 and
; mask %a3, 128-bit index vector %a2 viewed as <4 x i32>, final i8 operand 2.
; Expected code is a lone vpgatherdq with an xmm index and ymm mask/destination.
904define <4 x i64> @test_mm256_mask_i32gather_epi64(<4 x i64> %a0, i64 *%a1, <2 x i64> %a2, <4 x i64> %a3) {
905; X86-LABEL: test_mm256_mask_i32gather_epi64:
906; X86:       # %bb.0:
907; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
908; X86-NEXT:    vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0
909; X86-NEXT:    retl
911; X64-LABEL: test_mm256_mask_i32gather_epi64:
912; X64:       # %bb.0:
913; X64-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0
914; X64-NEXT:    retq
915  %arg1 = bitcast i64 *%a1 to i8*
916  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
917  %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <4 x i64> %a3, i8 2)
918  ret <4 x i64> %res
; Unmasked 128-bit double gather with i32 indices via llvm.x86.avx2.gather.d.pd.
; The mask is written as fcmp oeq of zero against zero, sign-extended and bitcast
; to <2 x double>; the expected code materialises it with a single vpcmpeqd.
; The destination is cleared with vxorpd before vgatherdpd and then moved to xmm0.
921define <2 x double> @test_mm_i32gather_pd(double *%a0, <2 x i64> %a1) {
922; X86-LABEL: test_mm_i32gather_pd:
923; X86:       # %bb.0:
924; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
925; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
926; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
927; X86-NEXT:    vgatherdpd %xmm2, (%eax,%xmm0,2), %xmm1
928; X86-NEXT:    vmovapd %xmm1, %xmm0
929; X86-NEXT:    retl
931; X64-LABEL: test_mm_i32gather_pd:
932; X64:       # %bb.0:
933; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
934; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
935; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm0,2), %xmm1
936; X64-NEXT:    vmovapd %xmm1, %xmm0
937; X64-NEXT:    retq
938  %arg0 = bitcast double *%a0 to i8*
939  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
940  %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
941  %sext = sext <2 x i1> %cmp to <2 x i64>
942  %mask = bitcast <2 x i64> %sext to <2 x double>
943  %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> undef, i8* %arg0, <4 x i32> %arg1, <2 x double> %mask, i8 2)
944  ret <2 x double> %res
946declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*, <4 x i32>, <2 x double>, i8) nounwind readonly
; Masked 128-bit double gather via llvm.x86.avx2.gather.d.pd: source %a0 and mask
; %a3 are passed through unchanged, indices are %a2 viewed as <4 x i32>, and the
; final i8 operand is 2. Expected code is a lone vgatherdpd writing xmm0.
948define <2 x double> @test_mm_mask_i32gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
949; X86-LABEL: test_mm_mask_i32gather_pd:
950; X86:       # %bb.0:
951; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
952; X86-NEXT:    vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0
953; X86-NEXT:    retl
955; X64-LABEL: test_mm_mask_i32gather_pd:
956; X64:       # %bb.0:
957; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0
958; X64-NEXT:    retq
959  %arg1 = bitcast double *%a1 to i8*
960  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
961  %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0, i8* %arg1, <4 x i32> %arg2, <2 x double> %a3, i8 2)
962  ret <2 x double> %res
; Unmasked 256-bit double gather with i32 indices via llvm.x86.avx2.gather.d.pd.256.
; Here the mask comes from llvm.x86.avx.cmp.pd.256 on two zero vectors with
; predicate 0, and the expected code keeps that compare: vxorpd produces the zero
; register and vcmpeqpd builds the mask before vgatherdpd runs.
965define <4 x double> @test_mm256_i32gather_pd(double *%a0, <2 x i64> %a1) {
966; X86-LABEL: test_mm256_i32gather_pd:
967; X86:       # %bb.0:
968; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
969; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
970; X86-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
971; X86-NEXT:    vgatherdpd %ymm2, (%eax,%xmm0,2), %ymm1
972; X86-NEXT:    vmovapd %ymm1, %ymm0
973; X86-NEXT:    retl
975; X64-LABEL: test_mm256_i32gather_pd:
976; X64:       # %bb.0:
977; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
978; X64-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
979; X64-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm0,2), %ymm1
980; X64-NEXT:    vmovapd %ymm1, %ymm0
981; X64-NEXT:    retq
982  %arg0 = bitcast double *%a0 to i8*
983  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
984  %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
985  %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8* %arg0, <4 x i32> %arg1, <4 x double> %mask, i8 2)
986  ret <4 x double> %res
988declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*, <4 x i32>, <4 x double>, i8) nounwind readonly
; Masked 256-bit double gather via llvm.x86.avx2.gather.d.pd.256: <4 x double> source
; %a0 and mask %a3, 128-bit index vector %a2 viewed as <4 x i32>, final i8
; operand 2. Expected code is a lone vgatherdpd with xmm index and ymm mask/result.
990define <4 x double> @test_mm256_mask_i32gather_pd(<4 x double> %a0, double *%a1, <2 x i64> %a2, <4 x double> %a3) {
991; X86-LABEL: test_mm256_mask_i32gather_pd:
992; X86:       # %bb.0:
993; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
994; X86-NEXT:    vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0
995; X86-NEXT:    retl
997; X64-LABEL: test_mm256_mask_i32gather_pd:
998; X64:       # %bb.0:
999; X64-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm1,2), %ymm0
1000; X64-NEXT:    retq
1001  %arg1 = bitcast double *%a1 to i8*
1002  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
1003  %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0, i8* %arg1, <4 x i32> %arg2, <4 x double> %a3, i8 2)
1004  ret <4 x double> %res
; Unmasked 128-bit float gather with i32 indices via llvm.x86.avx2.gather.d.ps.
; The mask is written as fcmp oeq of zero against zero, sign-extended and bitcast
; to <4 x float>; the expected code materialises it with a single vpcmpeqd.
; The destination is cleared with vxorps before vgatherdps and then moved to xmm0.
1007define <4 x float> @test_mm_i32gather_ps(float *%a0, <2 x i64> %a1) {
1008; X86-LABEL: test_mm_i32gather_ps:
1009; X86:       # %bb.0:
1010; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1011; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1012; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1013; X86-NEXT:    vgatherdps %xmm2, (%eax,%xmm0,2), %xmm1
1014; X86-NEXT:    vmovaps %xmm1, %xmm0
1015; X86-NEXT:    retl
1017; X64-LABEL: test_mm_i32gather_ps:
1018; X64:       # %bb.0:
1019; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1020; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1021; X64-NEXT:    vgatherdps %xmm2, (%rdi,%xmm0,2), %xmm1
1022; X64-NEXT:    vmovaps %xmm1, %xmm0
1023; X64-NEXT:    retq
1024  %arg0 = bitcast float *%a0 to i8*
1025  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1026  %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
1027  %sext = sext <4 x i1> %cmp to <4 x i32>
1028  %mask = bitcast <4 x i32> %sext to <4 x float>
1029  %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8* %arg0, <4 x i32> %arg1, <4 x float> %mask, i8 2)
1030  ret <4 x float> %call
1032declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*, <4 x i32>, <4 x float>, i8) nounwind readonly
; Masked 128-bit float gather via llvm.x86.avx2.gather.d.ps: source %a0 and mask %a3
; are passed through unchanged, indices are %a2 viewed as <4 x i32>, and the final
; i8 operand is 2. Expected code is a lone vgatherdps writing xmm0.
1034define <4 x float> @test_mm_mask_i32gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
1035; X86-LABEL: test_mm_mask_i32gather_ps:
1036; X86:       # %bb.0:
1037; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1038; X86-NEXT:    vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0
1039; X86-NEXT:    retl
1041; X64-LABEL: test_mm_mask_i32gather_ps:
1042; X64:       # %bb.0:
1043; X64-NEXT:    vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0
1044; X64-NEXT:    retq
1045  %arg1 = bitcast float *%a1 to i8*
1046  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
1047  %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0, i8* %arg1, <4 x i32> %arg2, <4 x float> %a3, i8 2)
1048  ret <4 x float> %call
; Unmasked 256-bit float gather with i32 indices via llvm.x86.avx2.gather.d.ps.256.
; The mask comes from llvm.x86.avx.cmp.ps.256 on two zero vectors with predicate 0,
; and the expected code keeps that compare: vxorps produces the zero register and
; vcmpeqps builds the mask before the ymm vgatherdps runs.
1051define <8 x float> @test_mm256_i32gather_ps(float *%a0, <4 x i64> %a1) {
1052; X86-LABEL: test_mm256_i32gather_ps:
1053; X86:       # %bb.0:
1054; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1055; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1056; X86-NEXT:    vcmpeqps %ymm1, %ymm1, %ymm2
1057; X86-NEXT:    vgatherdps %ymm2, (%eax,%ymm0,2), %ymm1
1058; X86-NEXT:    vmovaps %ymm1, %ymm0
1059; X86-NEXT:    retl
1061; X64-LABEL: test_mm256_i32gather_ps:
1062; X64:       # %bb.0:
1063; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1064; X64-NEXT:    vcmpeqps %ymm1, %ymm1, %ymm2
1065; X64-NEXT:    vgatherdps %ymm2, (%rdi,%ymm0,2), %ymm1
1066; X64-NEXT:    vmovaps %ymm1, %ymm0
1067; X64-NEXT:    retq
1068  %arg0 = bitcast float *%a0 to i8*
1069  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1070  %mask = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i8 0)
1071  %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8* %arg0, <8 x i32> %arg1, <8 x float> %mask, i8 2)
1072  ret <8 x float> %call
1074declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*, <8 x i32>, <8 x float>, i8) nounwind readonly
; Masked 256-bit float gather via llvm.x86.avx2.gather.d.ps.256: <8 x float> source
; %a0 and mask %a3, indices %a2 viewed as <8 x i32>, final i8 operand 2.
; Expected code is a lone ymm vgatherdps writing ymm0.
1076define <8 x float> @test_mm256_mask_i32gather_ps(<8 x float> %a0, float *%a1, <4 x i64> %a2, <8 x float> %a3) {
1077; X86-LABEL: test_mm256_mask_i32gather_ps:
1078; X86:       # %bb.0:
1079; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1080; X86-NEXT:    vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0
1081; X86-NEXT:    retl
1083; X64-LABEL: test_mm256_mask_i32gather_ps:
1084; X64:       # %bb.0:
1085; X64-NEXT:    vgatherdps %ymm2, (%rdi,%ymm1,2), %ymm0
1086; X64-NEXT:    retq
1087  %arg1 = bitcast float *%a1 to i8*
1088  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
1089  %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0, i8* %arg1, <8 x i32> %arg2, <8 x float> %a3, i8 2)
1090  ret <8 x float> %call
; Unmasked gather of i32 elements with <2 x i64> indices via
; llvm.x86.avx2.gather.q.d: undef source, constant all-ones mask viewed as
; <4 x i32>, final i8 operand 2. Expected code: vpcmpeqd for the mask, vpxor to
; clear the destination, vpgatherqd, then a move of the result into xmm0.
1093define <2 x i64> @test_mm_i64gather_epi32(i32 *%a0, <2 x i64> %a1) {
1094; X86-LABEL: test_mm_i64gather_epi32:
1095; X86:       # %bb.0:
1096; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1097; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1098; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1099; X86-NEXT:    vpgatherqd %xmm2, (%eax,%xmm0,2), %xmm1
1100; X86-NEXT:    vmovdqa %xmm1, %xmm0
1101; X86-NEXT:    retl
1103; X64-LABEL: test_mm_i64gather_epi32:
1104; X64:       # %bb.0:
1105; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1106; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1107; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm0,2), %xmm1
1108; X64-NEXT:    vmovdqa %xmm1, %xmm0
1109; X64-NEXT:    retq
1110  %arg0 = bitcast i32 *%a0 to i8*
1111  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
1112  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> undef, i8* %arg0, <2 x i64> %a1, <4 x i32> %mask, i8 2)
1113  %bc = bitcast <4 x i32> %call to <2 x i64>
1114  ret <2 x i64> %bc
1116declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, i8*, <2 x i64>, <4 x i32>, i8) nounwind readonly
; Masked gather via llvm.x86.avx2.gather.q.d: source %a0 and mask %a3 are viewed as
; <4 x i32>, the <2 x i64> indices %a2 are passed without a cast, and the final i8
; operand is 2. Expected code is a lone vpgatherqd writing xmm0.
1118define <2 x i64> @test_mm_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
1119; X86-LABEL: test_mm_mask_i64gather_epi32:
1120; X86:       # %bb.0:
1121; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1122; X86-NEXT:    vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0
1123; X86-NEXT:    retl
1125; X64-LABEL: test_mm_mask_i64gather_epi32:
1126; X64:       # %bb.0:
1127; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0
1128; X64-NEXT:    retq
1129  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
1130  %arg1 = bitcast i32 *%a1 to i8*
1131  %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
1132  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %arg0, i8* %arg1, <2 x i64> %a2, <4 x i32> %arg3, i8 2)
1133  %bc = bitcast <4 x i32> %call to <2 x i64>
1134  ret <2 x i64> %bc
; Unmasked gather of i32 elements with <4 x i64> indices via
; llvm.x86.avx2.gather.q.d.256: the result, source and mask are 128-bit <4 x i32>
; while the index vector is 256-bit. The expected vpgatherqd therefore uses a ymm
; index with xmm mask/destination, and a vzeroupper is expected before returning.
1137define <2 x i64> @test_mm256_i64gather_epi32(i32 *%a0, <4 x i64> %a1) {
1138; X86-LABEL: test_mm256_i64gather_epi32:
1139; X86:       # %bb.0:
1140; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1141; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1142; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1143; X86-NEXT:    vpgatherqd %xmm2, (%eax,%ymm0,2), %xmm1
1144; X86-NEXT:    vmovdqa %xmm1, %xmm0
1145; X86-NEXT:    vzeroupper
1146; X86-NEXT:    retl
1148; X64-LABEL: test_mm256_i64gather_epi32:
1149; X64:       # %bb.0:
1150; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1151; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1152; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm0,2), %xmm1
1153; X64-NEXT:    vmovdqa %xmm1, %xmm0
1154; X64-NEXT:    vzeroupper
1155; X64-NEXT:    retq
1156  %arg0 = bitcast i32 *%a0 to i8*
1157  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
1158  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8* %arg0, <4 x i64> %a1, <4 x i32> %mask, i8 2)
1159  %bc = bitcast <4 x i32> %call to <2 x i64>
1160  ret <2 x i64> %bc
1162declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, i8*, <4 x i64>, <4 x i32>, i8) nounwind readonly
; Masked gather via llvm.x86.avx2.gather.q.d.256: 128-bit source %a0 and mask %a3
; viewed as <4 x i32>, 256-bit <4 x i64> indices %a2, final i8 operand 2.
; Expected code is one vpgatherqd (ymm index, xmm mask/destination) followed by
; vzeroupper.
1164define <2 x i64> @test_mm256_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <4 x i64> %a2, <2 x i64> %a3) {
1165; X86-LABEL: test_mm256_mask_i64gather_epi32:
1166; X86:       # %bb.0:
1167; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1168; X86-NEXT:    vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0
1169; X86-NEXT:    vzeroupper
1170; X86-NEXT:    retl
1172; X64-LABEL: test_mm256_mask_i64gather_epi32:
1173; X64:       # %bb.0:
1174; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0
1175; X64-NEXT:    vzeroupper
1176; X64-NEXT:    retq
1177  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
1178  %arg1 = bitcast i32 *%a1 to i8*
1179  %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
1180  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %arg0, i8* %arg1, <4 x i64> %a2, <4 x i32> %arg3, i8 2)
1181  %bc = bitcast <4 x i32> %call to <2 x i64>
1182  ret <2 x i64> %bc
; Unmasked 128-bit gather of i64 elements with i64 indices via
; llvm.x86.avx2.gather.q.q: undef source, constant all-ones mask, final i8
; operand 2; no vector bitcasts are needed. Expected code: vpcmpeqd for the mask,
; vpxor to clear the destination, vpgatherqq, then a move of the result into xmm0.
1185define <2 x i64> @test_mm_i64gather_epi64(i64 *%a0, <2 x i64> %a1) {
1186; X86-LABEL: test_mm_i64gather_epi64:
1187; X86:       # %bb.0:
1188; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1189; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1190; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1191; X86-NEXT:    vpgatherqq %xmm2, (%eax,%xmm0,2), %xmm1
1192; X86-NEXT:    vmovdqa %xmm1, %xmm0
1193; X86-NEXT:    retl
1195; X64-LABEL: test_mm_i64gather_epi64:
1196; X64:       # %bb.0:
1197; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1198; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1199; X64-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm0,2), %xmm1
1200; X64-NEXT:    vmovdqa %xmm1, %xmm0
1201; X64-NEXT:    retq
1202  %arg0 = bitcast i64 *%a0 to i8*
1203  %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> undef, i8* %arg0, <2 x i64> %a1, <2 x i64> <i64 -1, i64 -1>, i8 2)
1204  ret <2 x i64> %call
1206declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, i8*, <2 x i64>, <2 x i64>, i8) nounwind readonly
; Masked 128-bit gather via llvm.x86.avx2.gather.q.q: source %a0, indices %a2 and
; mask %a3 are all passed as <2 x i64> without casts; the final i8 operand is 2.
; Expected code is a lone vpgatherqq writing xmm0.
1208define <2 x i64> @test_mm_mask_i64gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
1209; X86-LABEL: test_mm_mask_i64gather_epi64:
1210; X86:       # %bb.0:
1211; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1212; X86-NEXT:    vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0
1213; X86-NEXT:    retl
1215; X64-LABEL: test_mm_mask_i64gather_epi64:
1216; X64:       # %bb.0:
1217; X64-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0
1218; X64-NEXT:    retq
1219  %arg1 = bitcast i64 *%a1 to i8*
1220  %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0, i8* %arg1, <2 x i64> %a2, <2 x i64> %a3, i8 2)
1221  ret <2 x i64> %call
; Unmasked 256-bit gather of i64 elements with i64 indices via
; llvm.x86.avx2.gather.q.q.256: undef source, constant all-ones mask, final i8
; operand 2. Expected code: ymm vpcmpeqd for the mask, vpxor to clear the
; destination, ymm vpgatherqq, then a move of the result into ymm0.
1224define <4 x i64> @test_mm256_i64gather_epi64(i64 *%a0, <4 x i64> %a1) {
1225; X86-LABEL: test_mm256_i64gather_epi64:
1226; X86:       # %bb.0:
1227; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1228; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
1229; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1230; X86-NEXT:    vpgatherqq %ymm2, (%eax,%ymm0,2), %ymm1
1231; X86-NEXT:    vmovdqa %ymm1, %ymm0
1232; X86-NEXT:    retl
1234; X64-LABEL: test_mm256_i64gather_epi64:
1235; X64:       # %bb.0:
1236; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
1237; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1238; X64-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm0,2), %ymm1
1239; X64-NEXT:    vmovdqa %ymm1, %ymm0
1240; X64-NEXT:    retq
1241  %arg0 = bitcast i64 *%a0 to i8*
1242  %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8* %arg0, <4 x i64> %a1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
1243  ret <4 x i64> %call
1245declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, i8*, <4 x i64>, <4 x i64>, i8) nounwind readonly
; Masked 256-bit gather via llvm.x86.avx2.gather.q.q.256: source %a0, indices %a2 and
; mask %a3 are all passed as <4 x i64> without casts; the final i8 operand is 2.
; Expected code is a lone ymm vpgatherqq writing ymm0.
1247define <4 x i64> @test_mm256_mask_i64gather_epi64(<4 x i64> %a0, i64 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
1248; X86-LABEL: test_mm256_mask_i64gather_epi64:
1249; X86:       # %bb.0:
1250; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1251; X86-NEXT:    vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0
1252; X86-NEXT:    retl
1254; X64-LABEL: test_mm256_mask_i64gather_epi64:
1255; X64:       # %bb.0:
1256; X64-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0
1257; X64-NEXT:    retq
1258  %arg1 = bitcast i64 *%a1 to i8*
1259  %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0, i8* %arg1, <4 x i64> %a2, <4 x i64> %a3, i8 2)
1260  ret <4 x i64> %call
; Unmasked 128-bit double gather with i64 indices via llvm.x86.avx2.gather.q.pd.
; The mask is written as fcmp oeq of zero against zero, sign-extended and bitcast
; to <2 x double>; the expected code materialises it with a single vpcmpeqd.
; The destination is cleared with vxorpd before vgatherqpd and then moved to xmm0.
1263define <2 x double> @test_mm_i64gather_pd(double *%a0, <2 x i64> %a1) {
1264; X86-LABEL: test_mm_i64gather_pd:
1265; X86:       # %bb.0:
1266; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1267; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1268; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1269; X86-NEXT:    vgatherqpd %xmm2, (%eax,%xmm0,2), %xmm1
1270; X86-NEXT:    vmovapd %xmm1, %xmm0
1271; X86-NEXT:    retl
1273; X64-LABEL: test_mm_i64gather_pd:
1274; X64:       # %bb.0:
1275; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1276; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1277; X64-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm0,2), %xmm1
1278; X64-NEXT:    vmovapd %xmm1, %xmm0
1279; X64-NEXT:    retq
1280  %arg0 = bitcast double *%a0 to i8*
1281  %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
1282  %sext = sext <2 x i1> %cmp to <2 x i64>
1283  %mask = bitcast <2 x i64> %sext to <2 x double>
1284  %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> undef, i8* %arg0, <2 x i64> %a1, <2 x double> %mask, i8 2)
1285  ret <2 x double> %call
1287declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, i8*, <2 x i64>, <2 x double>, i8) nounwind readonly
; Masked 128-bit double gather via llvm.x86.avx2.gather.q.pd: source %a0, <2 x i64>
; indices %a2 and mask %a3 are passed without casts; the final i8 operand is 2.
; Expected code is a lone vgatherqpd writing xmm0.
1289define <2 x double> @test_mm_mask_i64gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
1290; X86-LABEL: test_mm_mask_i64gather_pd:
1291; X86:       # %bb.0:
1292; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1293; X86-NEXT:    vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0
1294; X86-NEXT:    retl
1296; X64-LABEL: test_mm_mask_i64gather_pd:
1297; X64:       # %bb.0:
1298; X64-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0
1299; X64-NEXT:    retq
1300  %arg1 = bitcast double *%a1 to i8*
1301  %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0, i8* %arg1, <2 x i64> %a2, <2 x double> %a3, i8 2)
1302  ret <2 x double> %call
; Unmasked 256-bit double gather with i64 indices via llvm.x86.avx2.gather.q.pd.256.
; The mask comes from llvm.x86.avx.cmp.pd.256 on two zero vectors with predicate 0,
; and the expected code keeps that compare: vxorpd produces the zero register and
; vcmpeqpd builds the mask before the ymm vgatherqpd runs.
1305define <4 x double> @test_mm256_i64gather_pd(double *%a0, <4 x i64> %a1) {
1306; X86-LABEL: test_mm256_i64gather_pd:
1307; X86:       # %bb.0:
1308; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1309; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1310; X86-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
1311; X86-NEXT:    vgatherqpd %ymm2, (%eax,%ymm0,2), %ymm1
1312; X86-NEXT:    vmovapd %ymm1, %ymm0
1313; X86-NEXT:    retl
1315; X64-LABEL: test_mm256_i64gather_pd:
1316; X64:       # %bb.0:
1317; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1318; X64-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
1319; X64-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm0,2), %ymm1
1320; X64-NEXT:    vmovapd %ymm1, %ymm0
1321; X64-NEXT:    retq
1322  %arg0 = bitcast double *%a0 to i8*
1323  %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
1324  %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8* %arg0, <4 x i64> %a1, <4 x double> %mask, i8 2)
1325  ret <4 x double> %call
1327declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, i8*, <4 x i64>, <4 x double>, i8) nounwind readonly
; Masked 256-bit double gather via llvm.x86.avx2.gather.q.pd.256: source %a0, <4 x i64>
; indices %a2 and mask %a3 are passed without casts; the final i8 operand is 2.
; Expected code is a lone ymm vgatherqpd writing ymm0.
; The base pointer is typed double* to match the other pd gather tests in this
; file; it is cast to i8* immediately, so the generated code is unaffected.
1329define <4 x double> @test_mm256_mask_i64gather_pd(<4 x double> %a0, double *%a1, <4 x i64> %a2, <4 x double> %a3) {
1330; X86-LABEL: test_mm256_mask_i64gather_pd:
1331; X86:       # %bb.0:
1332; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1333; X86-NEXT:    vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0
1334; X86-NEXT:    retl
1336; X64-LABEL: test_mm256_mask_i64gather_pd:
1337; X64:       # %bb.0:
1338; X64-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm1,2), %ymm0
1339; X64-NEXT:    retq
1340  %arg1 = bitcast double *%a1 to i8*
1341  %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0, i8* %arg1, <4 x i64> %a2, <4 x double> %a3, i8 2)
1342  ret <4 x double> %call
; Unmasked float gather with <2 x i64> indices via llvm.x86.avx2.gather.q.ps.
; The mask is written as fcmp oeq of zero against zero, sign-extended and bitcast
; to <4 x float>; the expected code materialises it with a single vpcmpeqd.
; The destination is cleared with vxorps before vgatherqps and then moved to xmm0.
1345define <4 x float> @test_mm_i64gather_ps(float *%a0, <2 x i64> %a1) {
1346; X86-LABEL: test_mm_i64gather_ps:
1347; X86:       # %bb.0:
1348; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1349; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1350; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1351; X86-NEXT:    vgatherqps %xmm2, (%eax,%xmm0,2), %xmm1
1352; X86-NEXT:    vmovaps %xmm1, %xmm0
1353; X86-NEXT:    retl
1355; X64-LABEL: test_mm_i64gather_ps:
1356; X64:       # %bb.0:
1357; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1358; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1359; X64-NEXT:    vgatherqps %xmm2, (%rdi,%xmm0,2), %xmm1
1360; X64-NEXT:    vmovaps %xmm1, %xmm0
1361; X64-NEXT:    retq
1362  %arg0 = bitcast float *%a0 to i8*
1363  %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
1364  %sext = sext <4 x i1> %cmp to <4 x i32>
1365  %mask = bitcast <4 x i32> %sext to <4 x float>
1366  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> undef, i8* %arg0, <2 x i64> %a1, <4 x float> %mask, i8 2)
1367  ret <4 x float> %call
1369declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*, <2 x i64>, <4 x float>, i8) nounwind readonly
; Masked float gather via llvm.x86.avx2.gather.q.ps: <4 x float> source %a0 and mask
; %a3, <2 x i64> indices %a2, all passed without casts; the final i8 operand is 2.
; Expected code is a lone vgatherqps writing xmm0.
1371define <4 x float> @test_mm_mask_i64gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
1372; X86-LABEL: test_mm_mask_i64gather_ps:
1373; X86:       # %bb.0:
1374; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1375; X86-NEXT:    vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0
1376; X86-NEXT:    retl
1378; X64-LABEL: test_mm_mask_i64gather_ps:
1379; X64:       # %bb.0:
1380; X64-NEXT:    vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0
1381; X64-NEXT:    retq
1382  %arg1 = bitcast float *%a1 to i8*
1383  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0, i8* %arg1, <2 x i64> %a2, <4 x float> %a3, i8 2)
1384  ret <4 x float> %call
; Unmasked float gather with <4 x i64> indices via llvm.x86.avx2.gather.q.ps.256:
; the result, source and mask are 128-bit <4 x float> while the index vector is
; 256-bit. The fcmp/sext/bitcast mask is expected to become a single vpcmpeqd;
; vgatherqps uses a ymm index with xmm mask/destination, and a vzeroupper is
; expected before returning.
1387define <4 x float> @test_mm256_i64gather_ps(float *%a0, <4 x i64> %a1) {
1388; X86-LABEL: test_mm256_i64gather_ps:
1389; X86:       # %bb.0:
1390; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1391; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1392; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1393; X86-NEXT:    vgatherqps %xmm2, (%eax,%ymm0,2), %xmm1
1394; X86-NEXT:    vmovaps %xmm1, %xmm0
1395; X86-NEXT:    vzeroupper
1396; X86-NEXT:    retl
1398; X64-LABEL: test_mm256_i64gather_ps:
1399; X64:       # %bb.0:
1400; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1401; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1402; X64-NEXT:    vgatherqps %xmm2, (%rdi,%ymm0,2), %xmm1
1403; X64-NEXT:    vmovaps %xmm1, %xmm0
1404; X64-NEXT:    vzeroupper
1405; X64-NEXT:    retq
1406  %arg0 = bitcast float *%a0 to i8*
1407  %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
1408  %sext = sext <4 x i1> %cmp to <4 x i32>
1409  %mask = bitcast <4 x i32> %sext to <4 x float>
1410  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8* %arg0, <4 x i64> %a1, <4 x float> %mask, i8 2)
1411  ret <4 x float> %call
1413declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, i8*, <4 x i64>, <4 x float>, i8) nounwind readonly
; Masked float gather via llvm.x86.avx2.gather.q.ps.256: 128-bit <4 x float> source
; %a0 and mask %a3, 256-bit <4 x i64> indices %a2, final i8 operand 2.
; Expected code is one vgatherqps (ymm index, xmm mask/destination) followed by
; vzeroupper.
1415define <4 x float> @test_mm256_mask_i64gather_ps(<4 x float> %a0, float *%a1, <4 x i64> %a2, <4 x float> %a3) {
1416; X86-LABEL: test_mm256_mask_i64gather_ps:
1417; X86:       # %bb.0:
1418; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1419; X86-NEXT:    vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0
1420; X86-NEXT:    vzeroupper
1421; X86-NEXT:    retl
1423; X64-LABEL: test_mm256_mask_i64gather_ps:
1424; X64:       # %bb.0:
1425; X64-NEXT:    vgatherqps %xmm2, (%rdi,%ymm1,2), %xmm0
1426; X64-NEXT:    vzeroupper
1427; X64-NEXT:    retq
1428  %arg1 = bitcast float *%a1 to i8*
1429  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0, i8* %arg1, <4 x i64> %a2, <4 x float> %a3, i8 2)
1430  ret <4 x float> %call
; Replaces the low two i64 elements of %a0 with %a1 using plain shufflevectors:
; %a1 is first widened to <4 x i64> (upper elements undef), then the mask
; <4, 5, 2, 3> takes elements 0-1 of the widened value and elements 2-3 of %a0.
; Expected code is a single vblendps of the two ymm registers.
1433define <4 x i64> @test0_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
1434; CHECK-LABEL: test0_mm256_inserti128_si256:
1435; CHECK:       # %bb.0:
1436; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
1437; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
1438; CHECK-NEXT:    ret{{[l|q]}}
1439  %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1440  %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
1441  ret <4 x i64> %res
; Replaces the high two i64 elements of %a0 with %a1 using plain shufflevectors:
; %a1 is first widened to <4 x i64> (upper elements undef), then the mask
; <0, 1, 4, 5> keeps elements 0-1 of %a0 and appends elements 0-1 of the widened
; value. Expected code is a single vinsertf128 with immediate 1.
1444define <4 x i64> @test1_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
1445; CHECK-LABEL: test1_mm256_inserti128_si256:
1446; CHECK:       # %bb.0:
1447; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1448; CHECK-NEXT:    ret{{[l|q]}}
1449  %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1450  %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1451  ret <4 x i64> %res
; Exercises llvm.x86.avx2.pmadd.wd: both <4 x i64> arguments are reinterpreted as
; <16 x i16>, the intrinsic returns <8 x i32>, and the result is cast back to
; <4 x i64>. Expected code is a single vpmaddwd on either target.
1454define <4 x i64> @test_mm256_madd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1455; CHECK-LABEL: test_mm256_madd_epi16:
1456; CHECK:       # %bb.0:
1457; CHECK-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
1458; CHECK-NEXT:    ret{{[l|q]}}
1459  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1460  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1461  %res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %arg0, <16 x i16> %arg1)
1462  %bc = bitcast <8 x i32> %res to <4 x i64>
1463  ret <4 x i64> %bc
1465declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone
; Exercises llvm.x86.avx2.pmadd.ub.sw: both <4 x i64> arguments are reinterpreted as
; <32 x i8>, the intrinsic returns <16 x i16>, and the result is cast back to
; <4 x i64>. Expected code is a single vpmaddubsw on either target.
1467define <4 x i64> @test_mm256_maddubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1468; CHECK-LABEL: test_mm256_maddubs_epi16:
1469; CHECK:       # %bb.0:
1470; CHECK-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0
1471; CHECK-NEXT:    ret{{[l|q]}}
1472  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1473  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1474  %res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %arg0, <32 x i8> %arg1)
1475  %bc = bitcast <16 x i16> %res to <4 x i64>
1476  ret <4 x i64> %bc
1478declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone
; Exercises llvm.x86.avx2.maskload.d: the pointer is cast to i8* and %a1 is viewed
; as a <4 x i32> mask. Expected code is a single xmm vpmaskmovd from memory, with
; the i386 run first loading the pointer argument from the stack.
1480define <2 x i64> @test_mm_maskload_epi32(i32* %a0, <2 x i64> %a1) nounwind {
1481; X86-LABEL: test_mm_maskload_epi32:
1482; X86:       # %bb.0:
1483; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1484; X86-NEXT:    vpmaskmovd (%eax), %xmm0, %xmm0
1485; X86-NEXT:    retl
1487; X64-LABEL: test_mm_maskload_epi32:
1488; X64:       # %bb.0:
1489; X64-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm0
1490; X64-NEXT:    retq
1491  %arg0 = bitcast i32* %a0 to i8*
1492  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1493  %call = call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %arg0, <4 x i32> %arg1)
1494  %bc = bitcast <4 x i32> %call to <2 x i64>
1495  ret <2 x i64> %bc
1497declare <4 x i32> @llvm.x86.avx2.maskload.d(i8*, <4 x i32>) nounwind readonly
; _mm256_maskload_epi32: 256-bit masked load of i32 elements through
; @llvm.x86.avx2.maskload.d.256. The expected output is vpmaskmovd with a ymm
; destination; the 32-bit target first loads the pointer from the stack.
1499define <4 x i64> @test_mm256_maskload_epi32(i32* %a0, <4 x i64> %a1) nounwind {
1500; X86-LABEL: test_mm256_maskload_epi32:
1501; X86:       # %bb.0:
1502; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1503; X86-NEXT:    vpmaskmovd (%eax), %ymm0, %ymm0
1504; X86-NEXT:    retl
1506; X64-LABEL: test_mm256_maskload_epi32:
1507; X64:       # %bb.0:
1508; X64-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm0
1509; X64-NEXT:    retq
1510  %arg0 = bitcast i32* %a0 to i8*
1511  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1512  %call = call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %arg0, <8 x i32> %arg1)
1513  %bc = bitcast <8 x i32> %call to <4 x i64>
1514  ret <4 x i64> %bc
1516declare <8 x i32> @llvm.x86.avx2.maskload.d.256(i8*, <8 x i32>) nounwind readonly
; _mm_maskload_epi64: 128-bit masked load of i64 elements through
; @llvm.x86.avx2.maskload.q. The mask %a1 already has the intrinsic's type, so
; only the pointer is cast. The expected output is vpmaskmovq with an xmm
; destination.
1518define <2 x i64> @test_mm_maskload_epi64(i64* %a0, <2 x i64> %a1) nounwind {
1519; X86-LABEL: test_mm_maskload_epi64:
1520; X86:       # %bb.0:
1521; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1522; X86-NEXT:    vpmaskmovq (%eax), %xmm0, %xmm0
1523; X86-NEXT:    retl
1525; X64-LABEL: test_mm_maskload_epi64:
1526; X64:       # %bb.0:
1527; X64-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm0
1528; X64-NEXT:    retq
1529  %arg0 = bitcast i64* %a0 to i8*
1530  %res = call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %arg0, <2 x i64> %a1)
1531  ret <2 x i64> %res
1533declare <2 x i64> @llvm.x86.avx2.maskload.q(i8*, <2 x i64>) nounwind readonly
; _mm256_maskload_epi64: 256-bit masked load of i64 elements through
; @llvm.x86.avx2.maskload.q.256. The expected output is vpmaskmovq with a ymm
; destination.
1535define <4 x i64> @test_mm256_maskload_epi64(i64* %a0, <4 x i64> %a1) nounwind {
1536; X86-LABEL: test_mm256_maskload_epi64:
1537; X86:       # %bb.0:
1538; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1539; X86-NEXT:    vpmaskmovq (%eax), %ymm0, %ymm0
1540; X86-NEXT:    retl
1542; X64-LABEL: test_mm256_maskload_epi64:
1543; X64:       # %bb.0:
1544; X64-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm0
1545; X64-NEXT:    retq
1546  %arg0 = bitcast i64* %a0 to i8*
1547  %res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %arg0, <4 x i64> %a1)
1548  ret <4 x i64> %res
1550declare <4 x i64> @llvm.x86.avx2.maskload.q.256(i8*, <4 x i64>) nounwind readonly
; _mm_maskstore_epi32: 128-bit masked store of i32 elements through
; @llvm.x86.avx2.maskstore.d (pointer as i8*, %a1 = mask, %a2 = data).
; The expected output is vpmaskmovd with a memory destination.
; NOTE(review): %a0 is float* here although the maskload_epi32 tests take i32*;
; it is cast to i8* either way, but confirm against the clang-generated IR.
1552define void @test_mm_maskstore_epi32(float* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
1553; X86-LABEL: test_mm_maskstore_epi32:
1554; X86:       # %bb.0:
1555; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1556; X86-NEXT:    vpmaskmovd %xmm1, %xmm0, (%eax)
1557; X86-NEXT:    retl
1559; X64-LABEL: test_mm_maskstore_epi32:
1560; X64:       # %bb.0:
1561; X64-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
1562; X64-NEXT:    retq
1563  %arg0 = bitcast float* %a0 to i8*
1564  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1565  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
1566  call void @llvm.x86.avx2.maskstore.d(i8* %arg0, <4 x i32> %arg1, <4 x i32> %arg2)
1567  ret void
; The store writes through its pointer operand, so it must not be declared
; readnone: a void readnone call would be removable as dead code.
1569declare void @llvm.x86.avx2.maskstore.d(i8*, <4 x i32>, <4 x i32>) nounwind
; _mm256_maskstore_epi32: 256-bit masked store of i32 elements through
; @llvm.x86.avx2.maskstore.d.256 (pointer as i8*, %a1 = mask, %a2 = data).
; The expected output is vpmaskmovd with a memory destination, followed by
; vzeroupper before the return.
; NOTE(review): %a0 is float* here although the maskload_epi32 tests take i32*;
; it is cast to i8* either way, but confirm against the clang-generated IR.
1571define void @test_mm256_maskstore_epi32(float* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
1572; X86-LABEL: test_mm256_maskstore_epi32:
1573; X86:       # %bb.0:
1574; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1575; X86-NEXT:    vpmaskmovd %ymm1, %ymm0, (%eax)
1576; X86-NEXT:    vzeroupper
1577; X86-NEXT:    retl
1579; X64-LABEL: test_mm256_maskstore_epi32:
1580; X64:       # %bb.0:
1581; X64-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi)
1582; X64-NEXT:    vzeroupper
1583; X64-NEXT:    retq
1584  %arg0 = bitcast float* %a0 to i8*
1585  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1586  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
1587  call void @llvm.x86.avx2.maskstore.d.256(i8* %arg0, <8 x i32> %arg1, <8 x i32> %arg2)
1588  ret void
; The store writes through its pointer operand, so it must not be declared
; readnone: a void readnone call would be removable as dead code.
1590declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>) nounwind
; _mm_maskstore_epi64: 128-bit masked store of i64 elements through
; @llvm.x86.avx2.maskstore.q (pointer as i8*, %a1 = mask, %a2 = data).
; The expected output is vpmaskmovq with a memory destination.
1592define void @test_mm_maskstore_epi64(i64* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
1593; X86-LABEL: test_mm_maskstore_epi64:
1594; X86:       # %bb.0:
1595; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1596; X86-NEXT:    vpmaskmovq %xmm1, %xmm0, (%eax)
1597; X86-NEXT:    retl
1599; X64-LABEL: test_mm_maskstore_epi64:
1600; X64:       # %bb.0:
1601; X64-NEXT:    vpmaskmovq %xmm1, %xmm0, (%rdi)
1602; X64-NEXT:    retq
1603  %arg0 = bitcast i64* %a0 to i8*
1604  call void @llvm.x86.avx2.maskstore.q(i8* %arg0, <2 x i64> %a1, <2 x i64> %a2)
1605  ret void
; The store writes through its pointer operand, so it must not be declared
; readnone: a void readnone call would be removable as dead code.
1607declare void @llvm.x86.avx2.maskstore.q(i8*, <2 x i64>, <2 x i64>) nounwind
; _mm256_maskstore_epi64: 256-bit masked store of i64 elements through
; @llvm.x86.avx2.maskstore.q.256 (pointer as i8*, %a1 = mask, %a2 = data).
; The expected output is vpmaskmovq with a memory destination, followed by
; vzeroupper before the return.
1609define void @test_mm256_maskstore_epi64(i64* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
1610; X86-LABEL: test_mm256_maskstore_epi64:
1611; X86:       # %bb.0:
1612; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1613; X86-NEXT:    vpmaskmovq %ymm1, %ymm0, (%eax)
1614; X86-NEXT:    vzeroupper
1615; X86-NEXT:    retl
1617; X64-LABEL: test_mm256_maskstore_epi64:
1618; X64:       # %bb.0:
1619; X64-NEXT:    vpmaskmovq %ymm1, %ymm0, (%rdi)
1620; X64-NEXT:    vzeroupper
1621; X64-NEXT:    retq
1622  %arg0 = bitcast i64* %a0 to i8*
1623  call void @llvm.x86.avx2.maskstore.q.256(i8* %arg0, <4 x i64> %a1, <4 x i64> %a2)
1624  ret void
; The store writes through its pointer operand, so it must not be declared
; readnone: a void readnone call would be removable as dead code.
1626declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>) nounwind
; _mm256_max_epi8: signed maximum on <32 x i8> through the generic
; @llvm.smax.v32i8 intrinsic. The expected output is a single vpmaxsb.
1628define <4 x i64> @test_mm256_max_epi8(<4 x i64> %a0, <4 x i64> %a1) {
1629; CHECK-LABEL: test_mm256_max_epi8:
1630; CHECK:       # %bb.0:
1631; CHECK-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
1632; CHECK-NEXT:    ret{{[l|q]}}
1633  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1634  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1635  %sel = call <32 x i8> @llvm.smax.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
1636  %bc = bitcast <32 x i8> %sel to <4 x i64>
1637  ret <4 x i64> %bc
1639declare <32 x i8> @llvm.smax.v32i8(<32 x i8>, <32 x i8>)
; _mm256_max_epi16: signed maximum on <16 x i16> through the generic
; @llvm.smax.v16i16 intrinsic. The expected output is a single vpmaxsw.
1641define <4 x i64> @test_mm256_max_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1642; CHECK-LABEL: test_mm256_max_epi16:
1643; CHECK:       # %bb.0:
1644; CHECK-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
1645; CHECK-NEXT:    ret{{[l|q]}}
1646  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1647  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1648  %sel = call <16 x i16> @llvm.smax.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
1649  %bc = bitcast <16 x i16> %sel to <4 x i64>
1650  ret <4 x i64> %bc
1652declare <16 x i16> @llvm.smax.v16i16(<16 x i16>, <16 x i16>)
; _mm256_max_epi32: signed maximum on <8 x i32> through the generic
; @llvm.smax.v8i32 intrinsic. The expected output is a single vpmaxsd.
1654define <4 x i64> @test_mm256_max_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1655; CHECK-LABEL: test_mm256_max_epi32:
1656; CHECK:       # %bb.0:
1657; CHECK-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
1658; CHECK-NEXT:    ret{{[l|q]}}
1659  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1660  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1661  %sel = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
1662  %bc = bitcast <8 x i32> %sel to <4 x i64>
1663  ret <4 x i64> %bc
1665declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>)
; _mm256_max_epu8: unsigned maximum on <32 x i8> through the generic
; @llvm.umax.v32i8 intrinsic. The expected output is a single vpmaxub.
1667define <4 x i64> @test_mm256_max_epu8(<4 x i64> %a0, <4 x i64> %a1) {
1668; CHECK-LABEL: test_mm256_max_epu8:
1669; CHECK:       # %bb.0:
1670; CHECK-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
1671; CHECK-NEXT:    ret{{[l|q]}}
1672  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1673  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1674  %sel = call <32 x i8> @llvm.umax.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
1675  %bc = bitcast <32 x i8> %sel to <4 x i64>
1676  ret <4 x i64> %bc
1678declare <32 x i8> @llvm.umax.v32i8(<32 x i8>, <32 x i8>)
; _mm256_max_epu16: unsigned maximum on <16 x i16> through the generic
; @llvm.umax.v16i16 intrinsic. The expected output is a single vpmaxuw.
1680define <4 x i64> @test_mm256_max_epu16(<4 x i64> %a0, <4 x i64> %a1) {
1681; CHECK-LABEL: test_mm256_max_epu16:
1682; CHECK:       # %bb.0:
1683; CHECK-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
1684; CHECK-NEXT:    ret{{[l|q]}}
1685  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1686  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1687  %sel = call <16 x i16> @llvm.umax.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
1688  %bc = bitcast <16 x i16> %sel to <4 x i64>
1689  ret <4 x i64> %bc
1691declare <16 x i16> @llvm.umax.v16i16(<16 x i16>, <16 x i16>)
; _mm256_max_epu32: unsigned maximum on <8 x i32> through the generic
; @llvm.umax.v8i32 intrinsic. The expected output is a single vpmaxud.
1693define <4 x i64> @test_mm256_max_epu32(<4 x i64> %a0, <4 x i64> %a1) {
1694; CHECK-LABEL: test_mm256_max_epu32:
1695; CHECK:       # %bb.0:
1696; CHECK-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
1697; CHECK-NEXT:    ret{{[l|q]}}
1698  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1699  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1700  %sel = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
1701  %bc = bitcast <8 x i32> %sel to <4 x i64>
1702  ret <4 x i64> %bc
1704declare <8 x i32> @llvm.umax.v8i32(<8 x i32>, <8 x i32>)
; _mm256_min_epi8: signed minimum on <32 x i8> through the generic
; @llvm.smin.v32i8 intrinsic. The expected output is a single vpminsb.
1706define <4 x i64> @test_mm256_min_epi8(<4 x i64> %a0, <4 x i64> %a1) {
1707; CHECK-LABEL: test_mm256_min_epi8:
1708; CHECK:       # %bb.0:
1709; CHECK-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
1710; CHECK-NEXT:    ret{{[l|q]}}
1711  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1712  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1713  %sel = call <32 x i8> @llvm.smin.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
1714  %bc = bitcast <32 x i8> %sel to <4 x i64>
1715  ret <4 x i64> %bc
1717declare <32 x i8> @llvm.smin.v32i8(<32 x i8>, <32 x i8>)
; _mm256_min_epi16: signed minimum on <16 x i16> through the generic
; @llvm.smin.v16i16 intrinsic. The expected output is a single vpminsw.
1719define <4 x i64> @test_mm256_min_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1720; CHECK-LABEL: test_mm256_min_epi16:
1721; CHECK:       # %bb.0:
1722; CHECK-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
1723; CHECK-NEXT:    ret{{[l|q]}}
1724  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1725  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1726  %sel = call <16 x i16> @llvm.smin.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
1727  %bc = bitcast <16 x i16> %sel to <4 x i64>
1728  ret <4 x i64> %bc
1730declare <16 x i16> @llvm.smin.v16i16(<16 x i16>, <16 x i16>)
; _mm256_min_epi32: signed minimum on <8 x i32> through the generic
; @llvm.smin.v8i32 intrinsic. The expected output is a single vpminsd.
1732define <4 x i64> @test_mm256_min_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1733; CHECK-LABEL: test_mm256_min_epi32:
1734; CHECK:       # %bb.0:
1735; CHECK-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
1736; CHECK-NEXT:    ret{{[l|q]}}
1737  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1738  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1739  %sel = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
1740  %bc = bitcast <8 x i32> %sel to <4 x i64>
1741  ret <4 x i64> %bc
1743declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>)
; _mm256_min_epu8: unsigned minimum on <32 x i8> through the generic
; @llvm.umin.v32i8 intrinsic. The expected output is a single vpminub.
1745define <4 x i64> @test_mm256_min_epu8(<4 x i64> %a0, <4 x i64> %a1) {
1746; CHECK-LABEL: test_mm256_min_epu8:
1747; CHECK:       # %bb.0:
1748; CHECK-NEXT:    vpminub %ymm1, %ymm0, %ymm0
1749; CHECK-NEXT:    ret{{[l|q]}}
1750  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1751  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1752  %sel = call <32 x i8> @llvm.umin.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
1753  %bc = bitcast <32 x i8> %sel to <4 x i64>
1754  ret <4 x i64> %bc
1756declare <32 x i8> @llvm.umin.v32i8(<32 x i8>, <32 x i8>)
; _mm256_min_epu16: unsigned minimum on <16 x i16> through the generic
; @llvm.umin.v16i16 intrinsic. The expected output is a single vpminuw.
1758define <4 x i64> @test_mm256_min_epu16(<4 x i64> %a0, <4 x i64> %a1) {
1759; CHECK-LABEL: test_mm256_min_epu16:
1760; CHECK:       # %bb.0:
1761; CHECK-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
1762; CHECK-NEXT:    ret{{[l|q]}}
1763  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1764  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1765  %sel = call <16 x i16> @llvm.umin.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
1766  %bc = bitcast <16 x i16> %sel to <4 x i64>
1767  ret <4 x i64> %bc
1769declare <16 x i16> @llvm.umin.v16i16(<16 x i16>, <16 x i16>)
; _mm256_min_epu32: unsigned minimum on <8 x i32> through the generic
; @llvm.umin.v8i32 intrinsic. The expected output is a single vpminud.
1771define <4 x i64> @test_mm256_min_epu32(<4 x i64> %a0, <4 x i64> %a1) {
1772; CHECK-LABEL: test_mm256_min_epu32:
1773; CHECK:       # %bb.0:
1774; CHECK-NEXT:    vpminud %ymm1, %ymm0, %ymm0
1775; CHECK-NEXT:    ret{{[l|q]}}
1776  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1777  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1778  %sel = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
1779  %bc = bitcast <8 x i32> %sel to <4 x i64>
1780  ret <4 x i64> %bc
1782declare <8 x i32> @llvm.umin.v8i32(<8 x i32>, <8 x i32>)
; _mm256_movemask_epi8: the input is reinterpreted as <32 x i8> and passed to
; @llvm.x86.avx2.pmovmskb, which returns an i32. The expected output is
; vpmovmskb into %eax, followed by vzeroupper before the return.
1784define i32 @test_mm256_movemask_epi8(<4 x i64> %a0) nounwind {
1785; CHECK-LABEL: test_mm256_movemask_epi8:
1786; CHECK:       # %bb.0:
1787; CHECK-NEXT:    vpmovmskb %ymm0, %eax
1788; CHECK-NEXT:    vzeroupper
1789; CHECK-NEXT:    ret{{[l|q]}}
1790  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1791  %res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %arg0)
1792  ret i32 %res
1794declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone
; _mm256_mpsadbw_epu8: calls @llvm.x86.avx2.mpsadbw with the constant i8 3 as
; the third operand. The expected output is vmpsadbw carrying the same
; immediate ($3).
1796define <4 x i64> @test_mm256_mpsadbw_epu8(<4 x i64> %a0, <4 x i64> %a1) {
1797; CHECK-LABEL: test_mm256_mpsadbw_epu8:
1798; CHECK:       # %bb.0:
1799; CHECK-NEXT:    vmpsadbw $3, %ymm1, %ymm0, %ymm0
1800; CHECK-NEXT:    ret{{[l|q]}}
1801  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1802  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1803  %call = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %arg0, <32 x i8> %arg1, i8 3)
1804  %bc = bitcast <16 x i16>  %call to <4 x i64>
1805  ret <4 x i64> %bc
1807declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone
; _mm256_mul_epi32 written as plain IR: shl by 32 followed by ashr by 32
; sign-extends the low 32 bits of every i64 lane, and the two sign-extended
; vectors are multiplied. The expected output is a single vpmuldq.
1809define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1810; CHECK-LABEL: test_mm256_mul_epi32:
1811; CHECK:       # %bb.0:
1812; CHECK-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
1813; CHECK-NEXT:    ret{{[l|q]}}
1814  %A = shl <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
1815  %A1 = ashr exact <4 x i64> %A, <i64 32, i64 32, i64 32, i64 32>
1816  %B = shl <4 x i64> %a1, <i64 32, i64 32, i64 32, i64 32>
1817  %B1 = ashr exact <4 x i64> %B, <i64 32, i64 32, i64 32, i64 32>
1818  %res = mul nsw <4 x i64> %A1, %B1
1819  ret <4 x i64> %res
; NOTE(review): the test above never calls this intrinsic; the declaration looks
; like a leftover from an intrinsic-based version - confirm before removing.
1821declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
; _mm256_mul_epu32 written as plain IR: both inputs are masked with 4294967295
; (0xFFFFFFFF) to keep the low 32 bits of every i64 lane, then multiplied.
; The expected output is a single vpmuludq.
1823define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) {
1824; CHECK-LABEL: test_mm256_mul_epu32:
1825; CHECK:       # %bb.0:
1826; CHECK-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
1827; CHECK-NEXT:    ret{{[l|q]}}
1828  %A = and <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1829  %B = and <4 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1830  %res = mul nuw <4 x i64> %A, %B
1831  ret <4 x i64> %res
; NOTE(review): the test above never calls this intrinsic; the declaration looks
; like a leftover from an intrinsic-based version - confirm before removing.
1833declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone
; _mm256_mulhi_epi16: both operands are reinterpreted as <16 x i16> and passed
; to @llvm.x86.avx2.pmulh.w. The expected output is a single vpmulhw.
1835define <4 x i64> @test_mm256_mulhi_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1836; CHECK-LABEL: test_mm256_mulhi_epi16:
1837; CHECK:       # %bb.0:
1838; CHECK-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
1839; CHECK-NEXT:    ret{{[l|q]}}
1840  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1841  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1842  %res = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %arg0, <16 x i16> %arg1)
1843  %bc = bitcast <16 x i16> %res to <4 x i64>
1844  ret <4 x i64> %bc
1846declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone
; _mm256_mulhi_epu16: both operands are reinterpreted as <16 x i16> and passed
; to @llvm.x86.avx2.pmulhu.w. The expected output is a single vpmulhuw.
1848define <4 x i64> @test_mm256_mulhi_epu16(<4 x i64> %a0, <4 x i64> %a1) {
1849; CHECK-LABEL: test_mm256_mulhi_epu16:
1850; CHECK:       # %bb.0:
1851; CHECK-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
1852; CHECK-NEXT:    ret{{[l|q]}}
1853  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1854  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1855  %res = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %arg0, <16 x i16> %arg1)
1856  %bc = bitcast <16 x i16> %res to <4 x i64>
1857  ret <4 x i64> %bc
1859declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone
; _mm256_mulhrs_epi16: both operands are reinterpreted as <16 x i16> and passed
; to @llvm.x86.avx2.pmul.hr.sw. The expected output is a single vpmulhrsw.
1861define <4 x i64> @test_mm256_mulhrs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1862; CHECK-LABEL: test_mm256_mulhrs_epi16:
1863; CHECK:       # %bb.0:
1864; CHECK-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0
1865; CHECK-NEXT:    ret{{[l|q]}}
1866  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1867  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1868  %res = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %arg0, <16 x i16> %arg1)
1869  %bc = bitcast <16 x i16> %res to <4 x i64>
1870  ret <4 x i64> %bc
1872declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone
; _mm256_mullo_epi16 written as a plain IR mul on <16 x i16>, with no
; target-specific intrinsic. The expected output is a single vpmullw.
1874define <4 x i64> @test_mm256_mullo_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1875; CHECK-LABEL: test_mm256_mullo_epi16:
1876; CHECK:       # %bb.0:
1877; CHECK-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
1878; CHECK-NEXT:    ret{{[l|q]}}
1879  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1880  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1881  %res = mul <16 x i16> %arg0, %arg1
1882  %bc = bitcast <16 x i16> %res to <4 x i64>
1883  ret <4 x i64> %bc
; _mm256_mullo_epi32 written as a plain IR mul on <8 x i32>, with no
; target-specific intrinsic. The expected output is a single vpmulld.
1886define <4 x i64> @test_mm256_mullo_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1887; CHECK-LABEL: test_mm256_mullo_epi32:
1888; CHECK:       # %bb.0:
1889; CHECK-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
1890; CHECK-NEXT:    ret{{[l|q]}}
1891  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1892  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1893  %res = mul <8 x i32> %arg0, %arg1
1894  %bc = bitcast <8 x i32> %res to <4 x i64>
1895  ret <4 x i64> %bc
; _mm256_or_si256 written as a plain IR or on <4 x i64>. The expected output
; uses the vorps form of the instruction.
1898define <4 x i64> @test_mm256_or_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
1899; CHECK-LABEL: test_mm256_or_si256:
1900; CHECK:       # %bb.0:
1901; CHECK-NEXT:    vorps %ymm1, %ymm0, %ymm0
1902; CHECK-NEXT:    ret{{[l|q]}}
1903  %res = or <4 x i64> %a0, %a1
1904  ret <4 x i64> %res
; _mm256_packs_epi16: two <16 x i16> inputs are packed into <32 x i8> by
; @llvm.x86.avx2.packsswb. The expected output is a single vpacksswb.
1907define <4 x i64> @test_mm256_packs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1908; CHECK-LABEL: test_mm256_packs_epi16:
1909; CHECK:       # %bb.0:
1910; CHECK-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
1911; CHECK-NEXT:    ret{{[l|q]}}
1912  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1913  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1914  %call = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %arg0, <16 x i16> %arg1)
1915  %res = bitcast <32 x i8> %call to <4 x i64>
1916  ret <4 x i64> %res
1918declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
; _mm256_packs_epi32: two <8 x i32> inputs are packed into <16 x i16> by
; @llvm.x86.avx2.packssdw. The expected output is a single vpackssdw.
1920define <4 x i64> @test_mm256_packs_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1921; CHECK-LABEL: test_mm256_packs_epi32:
1922; CHECK:       # %bb.0:
1923; CHECK-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
1924; CHECK-NEXT:    ret{{[l|q]}}
1925  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1926  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1927  %call = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %arg0, <8 x i32> %arg1)
1928  %res = bitcast <16 x i16> %call to <4 x i64>
1929  ret <4 x i64> %res
1931declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
; _mm256_packus_epi16: two <16 x i16> inputs are packed into <32 x i8> by
; @llvm.x86.avx2.packuswb. The expected output is a single vpackuswb.
1933define <4 x i64> @test_mm256_packus_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1934; CHECK-LABEL: test_mm256_packus_epi16:
1935; CHECK:       # %bb.0:
1936; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
1937; CHECK-NEXT:    ret{{[l|q]}}
1938  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1939  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1940  %call = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %arg0, <16 x i16> %arg1)
1941  %res = bitcast <32 x i8> %call to <4 x i64>
1942  ret <4 x i64> %res
1944declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
; _mm256_packus_epi32: two <8 x i32> inputs are packed into <16 x i16> by
; @llvm.x86.avx2.packusdw. The expected output is a single vpackusdw.
1946define <4 x i64> @test_mm256_packus_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1947; CHECK-LABEL: test_mm256_packus_epi32:
1948; CHECK:       # %bb.0:
1949; CHECK-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
1950; CHECK-NEXT:    ret{{[l|q]}}
1951  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1952  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1953  %call = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %arg0, <8 x i32> %arg1)
1954  %res = bitcast <16 x i16> %call to <4 x i64>
1955  ret <4 x i64> %res
1957declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
; _mm256_permute2x128_si256 written as a shufflevector: mask <2,3,6,7> takes
; the upper two i64 lanes of %a0 followed by the upper two i64 lanes of %a1.
; The expected output is vperm2f128 selecting ymm0[2,3] and ymm1[2,3].
1959define <4 x i64> @test_mm256_permute2x128_si256(<4 x i64> %a0, <4 x i64> %a1) {
1960; CHECK-LABEL: test_mm256_permute2x128_si256:
1961; CHECK:       # %bb.0:
1962; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
1963; CHECK-NEXT:    ret{{[l|q]}}
1964  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1965  ret <4 x i64> %res
; NOTE(review): the test above never calls this intrinsic; the declaration looks
; like a leftover from an intrinsic-based version - confirm before removing.
1967declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly
; _mm256_permute4x64_epi64 written as a single-input shufflevector with mask
; <3,0,2,0>. The expected output is vpermpd with the same lane order.
1969define <4 x i64> @test_mm256_permute4x64_epi64(<4 x i64> %a0) {
1970; CHECK-LABEL: test_mm256_permute4x64_epi64:
1971; CHECK:       # %bb.0:
1972; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,2,0]
1973; CHECK-NEXT:    ret{{[l|q]}}
1974  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
1975  ret <4 x i64> %res
; _mm256_permute4x64_pd written as a single-input shufflevector on
; <4 x double> with mask <1,2,1,0>. The expected output is vpermpd with the
; same lane order.
1978define <4 x double> @test_mm256_permute4x64_pd(<4 x double> %a0) {
1979; CHECK-LABEL: test_mm256_permute4x64_pd:
1980; CHECK:       # %bb.0:
1981; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,2,1,0]
1982; CHECK-NEXT:    ret{{[l|q]}}
1983  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
1984  ret <4 x double> %res
; _mm256_permutevar8x32_epi32: both operands are reinterpreted as <8 x i32>
; and passed to @llvm.x86.avx2.permd. The expected output is vpermps with
; %ymm0 and %ymm1 as sources, even though the elements are integers.
1987define <4 x i64> @test_mm256_permutevar8x32_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1988; CHECK-LABEL: test_mm256_permutevar8x32_epi32:
1989; CHECK:       # %bb.0:
1990; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1991; CHECK-NEXT:    ret{{[l|q]}}
1992  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1993  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1994  %call = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %arg0, <8 x i32> %arg1)
1995  %res = bitcast <8 x i32> %call to <4 x i64>
1996  ret <4 x i64> %res
1998declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
; _mm256_permutevar8x32_ps: %a0 is passed unchanged and %a1 is reinterpreted
; as <8 x i32> for @llvm.x86.avx2.permps. The expected output is a single
; vpermps.
2000define <8 x float> @test_mm256_permutevar8x32_ps(<8 x float> %a0, <4 x i64> %a1) {
2001; CHECK-LABEL: test_mm256_permutevar8x32_ps:
2002; CHECK:       # %bb.0:
2003; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
2004; CHECK-NEXT:    ret{{[l|q]}}
2005  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2006  %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %arg1)
2007  ret <8 x float> %res
2009declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly
; _mm256_sad_epu8: both operands are reinterpreted as <32 x i8> and passed to
; @llvm.x86.avx2.psad.bw, which already returns <4 x i64>, so no cast back is
; needed. The expected output is a single vpsadbw.
2011define <4 x i64> @test_mm256_sad_epu8(<4 x i64> %a0, <4 x i64> %a1) {
2012; CHECK-LABEL: test_mm256_sad_epu8:
2013; CHECK:       # %bb.0:
2014; CHECK-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
2015; CHECK-NEXT:    ret{{[l|q]}}
2016  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2017  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2018  %res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %arg0, <32 x i8> %arg1)
2019  ret <4 x i64> %res
2021declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone
; _mm256_shuffle_epi32 written as a shufflevector on <8 x i32>: the mask
; applies the pattern 3,3,0,0 to each group of four elements (3,3,0,0 then
; 7,7,4,4). The expected output is vpermilps with that element order.
2023define <4 x i64> @test_mm256_shuffle_epi32(<4 x i64> %a0) {
2024; CHECK-LABEL: test_mm256_shuffle_epi32:
2025; CHECK:       # %bb.0:
2026; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4]
2027; CHECK-NEXT:    ret{{[l|q]}}
2028  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2029  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
2030  %res = bitcast <8 x i32> %shuf to <4 x i64>
2031  ret <4 x i64> %res
; _mm256_shuffle_epi8: both operands are reinterpreted as <32 x i8> and passed
; to @llvm.x86.avx2.pshuf.b. The expected output is a single vpshufb.
2034define <4 x i64> @test_mm256_shuffle_epi8(<4 x i64> %a0, <4 x i64> %a1) {
2035; CHECK-LABEL: test_mm256_shuffle_epi8:
2036; CHECK:       # %bb.0:
2037; CHECK-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
2038; CHECK-NEXT:    ret{{[l|q]}}
2039  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2040  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2041  %shuf = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %arg0, <32 x i8> %arg1)
2042  %res = bitcast <32 x i8> %shuf to <4 x i64>
2043  ret <4 x i64> %res
2045declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone
; _mm256_shufflehi_epi16 written as a shufflevector on <16 x i16>: within each
; group of eight elements the low four stay in place and the high four are
; reordered (7,6,6,5 and 15,14,14,13). The expected output is vpshufhw with the
; same element order.
2047define <4 x i64> @test_mm256_shufflehi_epi16(<4 x i64> %a0) {
2048; CHECK-LABEL: test_mm256_shufflehi_epi16:
2049; CHECK:       # %bb.0:
2050; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,5,8,9,10,11,15,14,14,13]
2051; CHECK-NEXT:    ret{{[l|q]}}
2052  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2053  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
2054  %res = bitcast <16 x i16> %shuf to <4 x i64>
2055  ret <4 x i64> %res
; _mm256_shufflelo_epi16 written as a shufflevector on <16 x i16>: within each
; group of eight elements the low four are reordered (3,0,1,1 and 11,8,9,9) and
; the high four stay in place. The expected output is vpshuflw with the same
; element order.
2058define <4 x i64> @test_mm256_shufflelo_epi16(<4 x i64> %a0) {
2059; CHECK-LABEL: test_mm256_shufflelo_epi16:
2060; CHECK:       # %bb.0:
2061; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,0,1,1,4,5,6,7,11,8,9,9,12,13,14,15]
2062; CHECK-NEXT:    ret{{[l|q]}}
2063  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2064  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, i32 9, i32 12, i32 13, i32 14, i32 15>
2065  %res = bitcast <16 x i16> %shuf to <4 x i64>
2066  ret <4 x i64> %res
; _mm256_sign_epi8: both operands are reinterpreted as <32 x i8> and passed to
; @llvm.x86.avx2.psign.b. The expected output is a single vpsignb.
2069define <4 x i64> @test_mm256_sign_epi8(<4 x i64> %a0, <4 x i64> %a1) {
2070; CHECK-LABEL: test_mm256_sign_epi8:
2071; CHECK:       # %bb.0:
2072; CHECK-NEXT:    vpsignb %ymm1, %ymm0, %ymm0
2073; CHECK-NEXT:    ret{{[l|q]}}
2074  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2075  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2076  %call = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %arg0, <32 x i8> %arg1)
2077  %res = bitcast <32 x i8> %call to <4 x i64>
2078  ret <4 x i64> %res
2080declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone
; _mm256_sign_epi16: both operands are reinterpreted as <16 x i16> and passed
; to @llvm.x86.avx2.psign.w. The expected output is a single vpsignw.
2082define <4 x i64> @test_mm256_sign_epi16(<4 x i64> %a0, <4 x i64> %a1) {
2083; CHECK-LABEL: test_mm256_sign_epi16:
2084; CHECK:       # %bb.0:
2085; CHECK-NEXT:    vpsignw %ymm1, %ymm0, %ymm0
2086; CHECK-NEXT:    ret{{[l|q]}}
2087  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2088  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2089  %call = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %arg0, <16 x i16> %arg1)
2090  %res = bitcast <16 x i16> %call to <4 x i64>
2091  ret <4 x i64> %res
2093declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone
; _mm256_sign_epi32: both operands are reinterpreted as <8 x i32> and passed
; to @llvm.x86.avx2.psign.d. The expected output is a single vpsignd.
2095define <4 x i64> @test_mm256_sign_epi32(<4 x i64> %a0, <4 x i64> %a1) {
2096; CHECK-LABEL: test_mm256_sign_epi32:
2097; CHECK:       # %bb.0:
2098; CHECK-NEXT:    vpsignd %ymm1, %ymm0, %ymm0
2099; CHECK-NEXT:    ret{{[l|q]}}
2100  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2101  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2102  %call = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %arg0, <8 x i32> %arg1)
2103  %res = bitcast <8 x i32> %call to <4 x i64>
2104  ret <4 x i64> %res
2106declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone
; _mm256_sll_epi16: shifts <16 x i16> left by a count supplied in a 128-bit
; vector (%a1, reinterpreted as <8 x i16>) through @llvm.x86.avx2.psll.w.
; The expected output is vpsllw with an xmm count operand.
2108define <4 x i64> @test_mm256_sll_epi16(<4 x i64> %a0, <2 x i64> %a1) {
2109; CHECK-LABEL: test_mm256_sll_epi16:
2110; CHECK:       # %bb.0:
2111; CHECK-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
2112; CHECK-NEXT:    ret{{[l|q]}}
2113  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2114  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
2115  %res = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %arg0, <8 x i16> %arg1)
2116  %bc = bitcast <16 x i16> %res to <4 x i64>
2117  ret <4 x i64> %bc
2119declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone
; _mm256_sll_epi32: shifts <8 x i32> left by a count supplied in a 128-bit
; vector (%a1, reinterpreted as <4 x i32>) through @llvm.x86.avx2.psll.d.
; The expected output is vpslld with an xmm count operand.
2121define <4 x i64> @test_mm256_sll_epi32(<4 x i64> %a0, <2 x i64> %a1) {
2122; CHECK-LABEL: test_mm256_sll_epi32:
2123; CHECK:       # %bb.0:
2124; CHECK-NEXT:    vpslld %xmm1, %ymm0, %ymm0
2125; CHECK-NEXT:    ret{{[l|q]}}
2126  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2127  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2128  %res = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %arg0, <4 x i32> %arg1)
2129  %bc = bitcast <8 x i32> %res to <4 x i64>
2130  ret <4 x i64> %bc
2132declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
; _mm256_sll_epi64: shifts <4 x i64> left by a count supplied in a 128-bit
; vector through @llvm.x86.avx2.psll.q; the arguments already have the
; intrinsic's types, so no casts are needed. The expected output is vpsllq with
; an xmm count operand.
2134define <4 x i64> @test_mm256_sll_epi64(<4 x i64> %a0, <2 x i64> %a1) {
2135; CHECK-LABEL: test_mm256_sll_epi64:
2136; CHECK:       # %bb.0:
2137; CHECK-NEXT:    vpsllq %xmm1, %ymm0, %ymm0
2138; CHECK-NEXT:    ret{{[l|q]}}
2139  %res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
2140  ret <4 x i64> %res
2142declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone
; _mm256_slli_epi16: shifts <16 x i16> left by the constant i32 3 through
; @llvm.x86.avx2.pslli.w. The expected output is vpsllw with immediate $3.
2144define <4 x i64> @test_mm256_slli_epi16(<4 x i64> %a0) {
2145; CHECK-LABEL: test_mm256_slli_epi16:
2146; CHECK:       # %bb.0:
2147; CHECK-NEXT:    vpsllw $3, %ymm0, %ymm0
2148; CHECK-NEXT:    ret{{[l|q]}}
2149  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2150  %res = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %arg0, i32 3)
2151  %bc = bitcast <16 x i16> %res to <4 x i64>
2152  ret <4 x i64> %bc
2154declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone
; _mm256_slli_epi32: shifts <8 x i32> left by the constant i32 3 through
; @llvm.x86.avx2.pslli.d. The expected output is vpslld with immediate $3.
2156define <4 x i64> @test_mm256_slli_epi32(<4 x i64> %a0) {
2157; CHECK-LABEL: test_mm256_slli_epi32:
2158; CHECK:       # %bb.0:
2159; CHECK-NEXT:    vpslld $3, %ymm0, %ymm0
2160; CHECK-NEXT:    ret{{[l|q]}}
2161  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2162  %res = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %arg0, i32 3)
2163  %bc = bitcast <8 x i32> %res to <4 x i64>
2164  ret <4 x i64> %bc
2166declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) nounwind readnone
; _mm256_slli_epi64: shifts <4 x i64> left by the constant i32 3 through
; @llvm.x86.avx2.pslli.q. The expected output is vpsllq with immediate $3.
2168define <4 x i64> @test_mm256_slli_epi64(<4 x i64> %a0) {
2169; CHECK-LABEL: test_mm256_slli_epi64:
2170; CHECK:       # %bb.0:
2171; CHECK-NEXT:    vpsllq $3, %ymm0, %ymm0
2172; CHECK-NEXT:    ret{{[l|q]}}
2173  %res = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %a0, i32 3)
2174  ret <4 x i64> %res
2176declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) nounwind readnone
; _mm256_slli_si256 written as a byte shuffle of a zero vector (indices 0-31)
; with %arg0 (indices 32-63). Each 16-byte half of the result is three zero
; bytes followed by the first 13 bytes of the matching half of %arg0, i.e. a
; 3-byte left shift per half. The expected output is a single vpslldq.
2178define <4 x i64> @test_mm256_slli_si256(<4 x i64> %a0) {
2179; CHECK-LABEL: test_mm256_slli_si256:
2180; CHECK:       # %bb.0:
2181; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
2182; CHECK-NEXT:    ret{{[l|q]}}
2183  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2184  %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
2185  %res = bitcast <32 x i8> %shuf to <4 x i64>
2186  ret <4 x i64> %res
; _mm_sllv_epi32: 128-bit left shift of <4 x i32> with per-element counts in
; %a1, through @llvm.x86.avx2.psllv.d. The expected output is vpsllvd on xmm
; registers.
2189define <2 x i64> @test_mm_sllv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
2190; CHECK-LABEL: test_mm_sllv_epi32:
2191; CHECK:       # %bb.0:
2192; CHECK-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
2193; CHECK-NEXT:    ret{{[l|q]}}
2194  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
2195  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2196  %res = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %arg0, <4 x i32> %arg1)
2197  %bc = bitcast <4 x i32> %res to <2 x i64>
2198  ret <2 x i64> %bc
2200declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone
; _mm256_sllv_epi32: 256-bit left shift of <8 x i32> with per-element counts in
; %a1, through @llvm.x86.avx2.psllv.d.256. The expected output is vpsllvd on
; ymm registers.
2202define <4 x i64> @test_mm256_sllv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
2203; CHECK-LABEL: test_mm256_sllv_epi32:
2204; CHECK:       # %bb.0:
2205; CHECK-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
2206; CHECK-NEXT:    ret{{[l|q]}}
2207  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2208  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2209  %res = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
2210  %bc = bitcast <8 x i32> %res to <4 x i64>
2211  ret <4 x i64> %bc
2213declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
; _mm_sllv_epi64: 128-bit left shift of <2 x i64> with per-element counts in
; %a1, through @llvm.x86.avx2.psllv.q. The expected output is vpsllvq on xmm
; registers.
2215define <2 x i64> @test_mm_sllv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
2216; CHECK-LABEL: test_mm_sllv_epi64:
2217; CHECK:       # %bb.0:
2218; CHECK-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
2219; CHECK-NEXT:    ret{{[l|q]}}
2220  %res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
2221  ret <2 x i64> %res
2223declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone
; _mm256_sllv_epi64: 256-bit left shift of <4 x i64> with per-element counts in
; %a1, through @llvm.x86.avx2.psllv.q.256. The expected output is vpsllvq on
; ymm registers.
2225define <4 x i64> @test_mm256_sllv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
2226; CHECK-LABEL: test_mm256_sllv_epi64:
2227; CHECK:       # %bb.0:
2228; CHECK-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0
2229; CHECK-NEXT:    ret{{[l|q]}}
2230  %res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
2231  ret <4 x i64> %res
2233declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
; _mm256_sra_epi16: arithmetic right shift of <16 x i16> by a count supplied
; in a 128-bit vector (%a1, reinterpreted as <8 x i16>) through
; @llvm.x86.avx2.psra.w. The expected output is vpsraw with an xmm count
; operand.
2235define <4 x i64> @test_mm256_sra_epi16(<4 x i64> %a0, <2 x i64> %a1) {
2236; CHECK-LABEL: test_mm256_sra_epi16:
2237; CHECK:       # %bb.0:
2238; CHECK-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
2239; CHECK-NEXT:    ret{{[l|q]}}
2240  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2241  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
2242  %res = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %arg0, <8 x i16> %arg1)
2243  %bc = bitcast <16 x i16> %res to <4 x i64>
2244  ret <4 x i64> %bc
2246declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone
; _mm256_sra_epi32: %a0 is viewed as <8 x i32> and the 128-bit count operand
; %a1 as <4 x i32> before calling @llvm.x86.avx2.psra.d. Expected output is
; one vpsrad whose count comes from xmm1 and whose data is in ymm0.
2248define <4 x i64> @test_mm256_sra_epi32(<4 x i64> %a0, <2 x i64> %a1) {
2249; CHECK-LABEL: test_mm256_sra_epi32:
2250; CHECK:       # %bb.0:
2251; CHECK-NEXT:    vpsrad %xmm1, %ymm0, %ymm0
2252; CHECK-NEXT:    ret{{[l|q]}}
2253  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2254  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2255  %res = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %arg0, <4 x i32> %arg1)
2256  %bc = bitcast <8 x i32> %res to <4 x i64>
2257  ret <4 x i64> %bc
; Intrinsic used by test_mm256_sra_epi32.
2259declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone
; _mm256_srai_epi16: calls @llvm.x86.avx2.psrai.w on the <16 x i16> view of
; %a0 with the constant count 3. The checks expect that constant to appear as
; the immediate of a single vpsraw.
2261define <4 x i64> @test_mm256_srai_epi16(<4 x i64> %a0) {
2262; CHECK-LABEL: test_mm256_srai_epi16:
2263; CHECK:       # %bb.0:
2264; CHECK-NEXT:    vpsraw $3, %ymm0, %ymm0
2265; CHECK-NEXT:    ret{{[l|q]}}
2266  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2267  %res = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %arg0, i32 3)
2268  %bc = bitcast <16 x i16> %res to <4 x i64>
2269  ret <4 x i64> %bc
; Intrinsic used by test_mm256_srai_epi16.
2271declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone
; _mm256_srai_epi32: calls @llvm.x86.avx2.psrai.d on the <8 x i32> view of
; %a0 with the constant count 3. The checks expect that constant to appear as
; the immediate of a single vpsrad.
2273define <4 x i64> @test_mm256_srai_epi32(<4 x i64> %a0) {
2274; CHECK-LABEL: test_mm256_srai_epi32:
2275; CHECK:       # %bb.0:
2276; CHECK-NEXT:    vpsrad $3, %ymm0, %ymm0
2277; CHECK-NEXT:    ret{{[l|q]}}
2278  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2279  %res = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %arg0, i32 3)
2280  %bc = bitcast <8 x i32> %res to <4 x i64>
2281  ret <4 x i64> %bc
; Intrinsic used by test_mm256_srai_epi32.
2283declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) nounwind readnone
; _mm_srav_epi32: both <2 x i64> arguments are viewed as <4 x i32>, passed to
; @llvm.x86.avx2.psrav.d, and the result is cast back. Expected output is a
; single vpsravd on xmm registers.
2285define <2 x i64> @test_mm_srav_epi32(<2 x i64> %a0, <2 x i64> %a1) {
2286; CHECK-LABEL: test_mm_srav_epi32:
2287; CHECK:       # %bb.0:
2288; CHECK-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
2289; CHECK-NEXT:    ret{{[l|q]}}
2290  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
2291  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2292  %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %arg0, <4 x i32> %arg1)
2293  %bc = bitcast <4 x i32> %res to <2 x i64>
2294  ret <2 x i64> %bc
; Intrinsic used by test_mm_srav_epi32.
2296declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
; _mm256_srav_epi32: both <4 x i64> arguments are viewed as <8 x i32>, passed
; to @llvm.x86.avx2.psrav.d.256, and the result is cast back. Expected output
; is a single vpsravd on ymm registers.
2298define <4 x i64> @test_mm256_srav_epi32(<4 x i64> %a0, <4 x i64> %a1) {
2299; CHECK-LABEL: test_mm256_srav_epi32:
2300; CHECK:       # %bb.0:
2301; CHECK-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
2302; CHECK-NEXT:    ret{{[l|q]}}
2303  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2304  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2305  %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
2306  %bc = bitcast <8 x i32> %res to <4 x i64>
2307  ret <4 x i64> %bc
; Intrinsic used by test_mm256_srav_epi32.
2309declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone
; _mm256_srl_epi16: %a0 is viewed as <16 x i16> and the 128-bit count operand
; %a1 as <8 x i16> before calling @llvm.x86.avx2.psrl.w. Expected output is
; one vpsrlw whose count comes from xmm1 and whose data is in ymm0.
2311define <4 x i64> @test_mm256_srl_epi16(<4 x i64> %a0, <2 x i64> %a1) {
2312; CHECK-LABEL: test_mm256_srl_epi16:
2313; CHECK:       # %bb.0:
2314; CHECK-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
2315; CHECK-NEXT:    ret{{[l|q]}}
2316  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2317  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
2318  %res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %arg0, <8 x i16> %arg1)
2319  %bc = bitcast <16 x i16> %res to <4 x i64>
2320  ret <4 x i64> %bc
; Intrinsic used by test_mm256_srl_epi16.
2322declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone
; _mm256_srl_epi32: %a0 is viewed as <8 x i32> and the 128-bit count operand
; %a1 as <4 x i32> before calling @llvm.x86.avx2.psrl.d. Expected output is
; one vpsrld whose count comes from xmm1 and whose data is in ymm0.
2324define <4 x i64> @test_mm256_srl_epi32(<4 x i64> %a0, <2 x i64> %a1) {
2325; CHECK-LABEL: test_mm256_srl_epi32:
2326; CHECK:       # %bb.0:
2327; CHECK-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
2328; CHECK-NEXT:    ret{{[l|q]}}
2329  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2330  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2331  %res = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %arg0, <4 x i32> %arg1)
2332  %bc = bitcast <8 x i32> %res to <4 x i64>
2333  ret <4 x i64> %bc
; Intrinsic used by test_mm256_srl_epi32.
2335declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
; _mm256_srl_epi64: the arguments already have the intrinsic's types, so
; @llvm.x86.avx2.psrl.q is called with no bitcasts. Expected output is one
; vpsrlq whose count comes from xmm1 and whose data is in ymm0.
2337define <4 x i64> @test_mm256_srl_epi64(<4 x i64> %a0, <2 x i64> %a1) {
2338; CHECK-LABEL: test_mm256_srl_epi64:
2339; CHECK:       # %bb.0:
2340; CHECK-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
2341; CHECK-NEXT:    ret{{[l|q]}}
2342  %res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
2343  ret <4 x i64> %res
; Intrinsic used by test_mm256_srl_epi64.
2345declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone
; _mm256_srli_epi16: calls @llvm.x86.avx2.psrli.w on the <16 x i16> view of
; %a0 with the constant count 3. The checks expect that constant to appear as
; the immediate of a single vpsrlw.
2347define <4 x i64> @test_mm256_srli_epi16(<4 x i64> %a0) {
2348; CHECK-LABEL: test_mm256_srli_epi16:
2349; CHECK:       # %bb.0:
2350; CHECK-NEXT:    vpsrlw $3, %ymm0, %ymm0
2351; CHECK-NEXT:    ret{{[l|q]}}
2352  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2353  %res = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %arg0, i32 3)
2354  %bc = bitcast <16 x i16> %res to <4 x i64>
2355  ret <4 x i64> %bc
; Intrinsic used by test_mm256_srli_epi16.
2357declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone
; _mm256_srli_epi32: calls @llvm.x86.avx2.psrli.d on the <8 x i32> view of
; %a0 with the constant count 3. The checks expect that constant to appear as
; the immediate of a single vpsrld.
2359define <4 x i64> @test_mm256_srli_epi32(<4 x i64> %a0) {
2360; CHECK-LABEL: test_mm256_srli_epi32:
2361; CHECK:       # %bb.0:
2362; CHECK-NEXT:    vpsrld $3, %ymm0, %ymm0
2363; CHECK-NEXT:    ret{{[l|q]}}
2364  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2365  %res = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %arg0, i32 3)
2366  %bc = bitcast <8 x i32> %res to <4 x i64>
2367  ret <4 x i64> %bc
; Intrinsic used by test_mm256_srli_epi32.
2369declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) nounwind readnone
; _mm256_srli_epi64: calls @llvm.x86.avx2.psrli.q directly on %a0 with the
; constant count 3 (no bitcasts needed). The checks expect that constant to
; appear as the immediate of a single vpsrlq.
2371define <4 x i64> @test_mm256_srli_epi64(<4 x i64> %a0) {
2372; CHECK-LABEL: test_mm256_srli_epi64:
2373; CHECK:       # %bb.0:
2374; CHECK-NEXT:    vpsrlq $3, %ymm0, %ymm0
2375; CHECK-NEXT:    ret{{[l|q]}}
2376  %res = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %a0, i32 3)
2377  ret <4 x i64> %res
; Intrinsic used by test_mm256_srli_epi64.
2379declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) nounwind readnone
; _mm256_srli_si256: expressed as a plain shufflevector rather than an
; intrinsic call. The second shuffle operand is zeroinitializer, so mask
; indices 32-34 and 48-50 (which address that operand) yield zero bytes.
; Result bytes 0-12 come from source bytes 3-15 and result bytes 16-28 from
; source bytes 19-31, i.e. each 128-bit half is moved down by three bytes
; with zero fill. The checks expect this to be matched as a single vpsrldq.
2381define <4 x i64> @test_mm256_srli_si256(<4 x i64> %a0) {
2382; CHECK-LABEL: test_mm256_srli_si256:
2383; CHECK:       # %bb.0:
2384; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
2385; CHECK-NEXT:    ret{{[l|q]}}
2386  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2387  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
2388  %res = bitcast <32 x i8> %shuf to <4 x i64>
2389  ret <4 x i64> %res
; _mm_srlv_epi32: both <2 x i64> arguments are viewed as <4 x i32>, passed to
; @llvm.x86.avx2.psrlv.d, and the result is cast back. Expected output is a
; single vpsrlvd on xmm registers.
2392define <2 x i64> @test_mm_srlv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
2393; CHECK-LABEL: test_mm_srlv_epi32:
2394; CHECK:       # %bb.0:
2395; CHECK-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
2396; CHECK-NEXT:    ret{{[l|q]}}
2397  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
2398  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2399  %res = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %arg0, <4 x i32> %arg1)
2400  %bc = bitcast <4 x i32> %res to <2 x i64>
2401  ret <2 x i64> %bc
; Intrinsic used by test_mm_srlv_epi32.
2403declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone
; _mm256_srlv_epi32: both <4 x i64> arguments are viewed as <8 x i32>, passed
; to @llvm.x86.avx2.psrlv.d.256, and the result is cast back. Expected output
; is a single vpsrlvd on ymm registers.
2405define <4 x i64> @test_mm256_srlv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
2406; CHECK-LABEL: test_mm256_srlv_epi32:
2407; CHECK:       # %bb.0:
2408; CHECK-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
2409; CHECK-NEXT:    ret{{[l|q]}}
2410  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2411  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2412  %res = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
2413  %bc = bitcast <8 x i32> %res to <4 x i64>
2414  ret <4 x i64> %bc
; Intrinsic used by test_mm256_srlv_epi32.
2416declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
; _mm_srlv_epi64: the <2 x i64> arguments go straight to
; @llvm.x86.avx2.psrlv.q with no bitcasts. Expected output is a single
; vpsrlvq on xmm registers.
2418define <2 x i64> @test_mm_srlv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
2419; CHECK-LABEL: test_mm_srlv_epi64:
2420; CHECK:       # %bb.0:
2421; CHECK-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
2422; CHECK-NEXT:    ret{{[l|q]}}
2423  %res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
2424  ret <2 x i64> %res
; Intrinsic used by test_mm_srlv_epi64.
2426declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone
; _mm256_srlv_epi64: the <4 x i64> arguments go straight to
; @llvm.x86.avx2.psrlv.q.256 with no bitcasts. Expected output is a single
; vpsrlvq on ymm registers.
2428define <4 x i64> @test_mm256_srlv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
2429; CHECK-LABEL: test_mm256_srlv_epi64:
2430; CHECK:       # %bb.0:
2431; CHECK-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
2432; CHECK-NEXT:    ret{{[l|q]}}
2433  %res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
2434  ret <4 x i64> %res
; Intrinsic used by test_mm256_srlv_epi64.
2436declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
; _mm256_stream_load_si256: the typed pointer argument is cast to i8* and
; handed to @llvm.x86.avx2.movntdqa. This test uses separate X86 and X64 check
; prefixes instead of the shared one because the expected code differs per
; target: the i386 run first loads the pointer from the stack into %eax,
; while the x86_64 run addresses through %rdi directly. Both expect a
; vmovntdqa load into ymm0.
2438define <4 x i64> @test_mm256_stream_load_si256(<4 x i64> *%a0) {
2439; X86-LABEL: test_mm256_stream_load_si256:
2440; X86:       # %bb.0:
2441; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2442; X86-NEXT:    vmovntdqa (%eax), %ymm0
2443; X86-NEXT:    retl
2445; X64-LABEL: test_mm256_stream_load_si256:
2446; X64:       # %bb.0:
2447; X64-NEXT:    vmovntdqa (%rdi), %ymm0
2448; X64-NEXT:    retq
2449  %arg0 = bitcast <4 x i64> *%a0 to i8*
2450  %res = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %arg0)
2451  ret <4 x i64> %res
; Intrinsic used by test_mm256_stream_load_si256; declared readonly (it reads
; memory) rather than readnone like the arithmetic intrinsics in this file.
2453declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly
; _mm256_sub_epi8: uses the generic IR `sub` on the <32 x i8> views of the
; arguments (no target intrinsic). Expected output is a single vpsubb.
2455define <4 x i64> @test_mm256_sub_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2456; CHECK-LABEL: test_mm256_sub_epi8:
2457; CHECK:       # %bb.0:
2458; CHECK-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
2459; CHECK-NEXT:    ret{{[l|q]}}
2460  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2461  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2462  %res = sub <32 x i8> %arg0, %arg1
2463  %bc = bitcast <32 x i8> %res to <4 x i64>
2464  ret <4 x i64> %bc
; _mm256_sub_epi16: uses the generic IR `sub` on the <16 x i16> views of the
; arguments (no target intrinsic). Expected output is a single vpsubw.
2467define <4 x i64> @test_mm256_sub_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2468; CHECK-LABEL: test_mm256_sub_epi16:
2469; CHECK:       # %bb.0:
2470; CHECK-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
2471; CHECK-NEXT:    ret{{[l|q]}}
2472  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2473  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2474  %res = sub <16 x i16> %arg0, %arg1
2475  %bc = bitcast <16 x i16> %res to <4 x i64>
2476  ret <4 x i64> %bc
; _mm256_sub_epi32: uses the generic IR `sub` on the <8 x i32> views of the
; arguments (no target intrinsic). Expected output is a single vpsubd.
2479define <4 x i64> @test_mm256_sub_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2480; CHECK-LABEL: test_mm256_sub_epi32:
2481; CHECK:       # %bb.0:
2482; CHECK-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
2483; CHECK-NEXT:    ret{{[l|q]}}
2484  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2485  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2486  %res = sub <8 x i32> %arg0, %arg1
2487  %bc = bitcast <8 x i32> %res to <4 x i64>
2488  ret <4 x i64> %bc
; _mm256_sub_epi64: generic IR `sub` applied directly to the <4 x i64>
; arguments, so no bitcasts are needed. Expected output is a single vpsubq.
2491define <4 x i64> @test_mm256_sub_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2492; CHECK-LABEL: test_mm256_sub_epi64:
2493; CHECK:       # %bb.0:
2494; CHECK-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
2495; CHECK-NEXT:    ret{{[l|q]}}
2496  %res = sub <4 x i64> %a0, %a1
2497  ret <4 x i64> %res
; _mm256_subs_epi8: signed saturating subtract expressed with the
; target-independent @llvm.ssub.sat.v32i8 intrinsic on the <32 x i8> views of
; the arguments. Expected output is a single vpsubsb.
2500define <4 x i64> @test_mm256_subs_epi8(<4 x i64> %a0, <4 x i64> %a1) {
2501; CHECK-LABEL: test_mm256_subs_epi8:
2502; CHECK:       # %bb.0:
2503; CHECK-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0
2504; CHECK-NEXT:    ret{{[l|q]}}
2505  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2506  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2507  %res = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
2508  %bc = bitcast <32 x i8> %res to <4 x i64>
2509  ret <4 x i64> %bc
; Intrinsic used by test_mm256_subs_epi8.
2511declare <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
; _mm256_subs_epi16: signed saturating subtract expressed with the
; target-independent @llvm.ssub.sat.v16i16 intrinsic on the <16 x i16> views
; of the arguments. Expected output is a single vpsubsw.
2513define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
2514; CHECK-LABEL: test_mm256_subs_epi16:
2515; CHECK:       # %bb.0:
2516; CHECK-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0
2517; CHECK-NEXT:    ret{{[l|q]}}
2518  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2519  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2520  %res = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
2521  %bc = bitcast <16 x i16> %res to <4 x i64>
2522  ret <4 x i64> %bc
; Intrinsic used by test_mm256_subs_epi16.
2524declare <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
; _mm256_subs_epu8: unsigned saturating subtract expressed with the
; target-independent @llvm.usub.sat.v32i8 intrinsic on the <32 x i8> views of
; the arguments. Expected output is a single vpsubusb.
2526define <4 x i64> @test_mm256_subs_epu8(<4 x i64> %a0, <4 x i64> %a1) {
2527; CHECK-LABEL: test_mm256_subs_epu8:
2528; CHECK:       # %bb.0:
2529; CHECK-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
2530; CHECK-NEXT:    ret{{[l|q]}}
2531  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2532  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2533  %res = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
2534  %bc = bitcast <32 x i8> %res to <4 x i64>
2535  ret <4 x i64> %bc
; Intrinsic used by test_mm256_subs_epu8.
; NOTE(review): unlike the llvm.ssub.sat declarations in this file, no
; `nounwind readnone` attributes are spelled out here - confirm whether that
; difference is intentional.
2537declare <32 x i8> @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>)
; _mm256_subs_epu16: unsigned saturating subtract expressed with the
; target-independent @llvm.usub.sat.v16i16 intrinsic on the <16 x i16> views
; of the arguments. Expected output is a single vpsubusw.
2539define <4 x i64> @test_mm256_subs_epu16(<4 x i64> %a0, <4 x i64> %a1) {
2540; CHECK-LABEL: test_mm256_subs_epu16:
2541; CHECK:       # %bb.0:
2542; CHECK-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
2543; CHECK-NEXT:    ret{{[l|q]}}
2544  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2545  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2546  %res = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
2547  %bc = bitcast <16 x i16> %res to <4 x i64>
2548  ret <4 x i64> %bc
; Intrinsic used by test_mm256_subs_epu16.
; NOTE(review): unlike the llvm.ssub.sat declarations in this file, no
; `nounwind readnone` attributes are spelled out here - confirm whether that
; difference is intentional.
2550declare <16 x i16> @llvm.usub.sat.v16i16(<16 x i16>, <16 x i16>)
; _mm256_unpackhi_epi8: a shufflevector on the <32 x i8> views that
; interleaves bytes 8-15 of %a0 with bytes 8-15 of %a1 (mask indices 40-47),
; then bytes 24-31 of %a0 with bytes 24-31 of %a1 (mask indices 56-63).
; Expected output is a single vpunpckhbw with the matching decoded shuffle.
2552define <4 x i64> @test_mm256_unpackhi_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2553; CHECK-LABEL: test_mm256_unpackhi_epi8:
2554; CHECK:       # %bb.0:
2555; CHECK-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
2556; CHECK-NEXT:    ret{{[l|q]}}
2557  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2558  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2559  %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
2560  %bc = bitcast <32 x i8> %res to <4 x i64>
2561  ret <4 x i64> %bc
; _mm256_unpackhi_epi16: a shufflevector on the <16 x i16> views that
; interleaves elements 4-7 of %a0 with elements 4-7 of %a1 (mask indices
; 20-23), then elements 12-15 of each (mask indices 28-31 for %a1).
; Expected output is a single vpunpckhwd.
2564define <4 x i64> @test_mm256_unpackhi_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2565; CHECK-LABEL: test_mm256_unpackhi_epi16:
2566; CHECK:       # %bb.0:
2567; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
2568; CHECK-NEXT:    ret{{[l|q]}}
2569  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2570  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2571  %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
2572  %bc = bitcast <16 x i16> %res to <4 x i64>
2573  ret <4 x i64> %bc
; _mm256_unpackhi_epi32: a shufflevector on the <8 x i32> views that
; interleaves elements 2,3 and 6,7 of %a0 with the same elements of %a1
; (mask indices 10,11,14,15). Note that the checks expect vunpckhps here,
; not an integer-mnemonic unpack.
2576define <4 x i64> @test_mm256_unpackhi_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2577; CHECK-LABEL: test_mm256_unpackhi_epi32:
2578; CHECK:       # %bb.0:
2579; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
2580; CHECK-NEXT:    ret{{[l|q]}}
2581  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2582  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2583  %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
2584  %bc = bitcast <8 x i32> %res to <4 x i64>
2585  ret <4 x i64> %bc
; _mm256_unpackhi_epi64: a shufflevector directly on the <4 x i64> arguments
; selecting elements 1 and 3 of %a0 interleaved with elements 1 and 3 of %a1
; (mask indices 5 and 7). The checks expect vunpckhpd.
2588define <4 x i64> @test_mm256_unpackhi_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2589; CHECK-LABEL: test_mm256_unpackhi_epi64:
2590; CHECK:       # %bb.0:
2591; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
2592; CHECK-NEXT:    ret{{[l|q]}}
2593  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
2594  ret <4 x i64> %res
; _mm256_unpacklo_epi8: a shufflevector on the <32 x i8> views that
; interleaves bytes 0-7 of %a0 with bytes 0-7 of %a1 (mask indices 32-39),
; then bytes 16-23 of %a0 with bytes 16-23 of %a1 (mask indices 48-55).
; Expected output is a single vpunpcklbw with the matching decoded shuffle.
2597define <4 x i64> @test_mm256_unpacklo_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2598; CHECK-LABEL: test_mm256_unpacklo_epi8:
2599; CHECK:       # %bb.0:
2600; CHECK-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
2601; CHECK-NEXT:    ret{{[l|q]}}
2602  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2603  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2604  %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
2605  %bc = bitcast <32 x i8> %res to <4 x i64>
2606  ret <4 x i64> %bc
; _mm256_unpacklo_epi16: a shufflevector on the <16 x i16> views that
; interleaves elements 0-3 of %a0 with elements 0-3 of %a1 (mask indices
; 16-19), then elements 8-11 of each (mask indices 24-27 for %a1).
; Expected output is a single vpunpcklwd.
2609define <4 x i64> @test_mm256_unpacklo_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2610; CHECK-LABEL: test_mm256_unpacklo_epi16:
2611; CHECK:       # %bb.0:
2612; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
2613; CHECK-NEXT:    ret{{[l|q]}}
2614  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2615  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2616  %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
2617  %bc = bitcast <16 x i16> %res to <4 x i64>
2618  ret <4 x i64> %bc
; _mm256_unpacklo_epi32: a shufflevector on the <8 x i32> views that
; interleaves elements 0,1 and 4,5 of %a0 with the same elements of %a1
; (mask indices 8,9,12,13). Note that the checks expect vunpcklps here,
; not an integer-mnemonic unpack.
2621define <4 x i64> @test_mm256_unpacklo_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2622; CHECK-LABEL: test_mm256_unpacklo_epi32:
2623; CHECK:       # %bb.0:
2624; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
2625; CHECK-NEXT:    ret{{[l|q]}}
2626  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2627  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2628  %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
2629  %bc = bitcast <8 x i32> %res to <4 x i64>
2630  ret <4 x i64> %bc
; _mm256_unpacklo_epi64: a shufflevector directly on the <4 x i64> arguments
; selecting elements 0 and 2 of %a0 interleaved with elements 0 and 2 of %a1
; (mask indices 4 and 6). The checks expect vunpcklpd.
2633define <4 x i64> @test_mm256_unpacklo_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2634; CHECK-LABEL: test_mm256_unpacklo_epi64:
2635; CHECK:       # %bb.0:
2636; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
2637; CHECK-NEXT:    ret{{[l|q]}}
2638  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
2639  ret <4 x i64> %res
; _mm256_xor_si256: generic IR `xor` applied directly to the <4 x i64>
; arguments. Note that the checks expect vxorps here, not an
; integer-mnemonic xor.
2642define <4 x i64> @test_mm256_xor_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2643; CHECK-LABEL: test_mm256_xor_si256:
2644; CHECK:       # %bb.0:
2645; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
2646; CHECK-NEXT:    ret{{[l|q]}}
2647  %res = xor <4 x i64> %a0, %a1
2648  ret <4 x i64> %res
; Declarations of the 256-bit AVX floating-point compare intrinsics. Nothing
; in the nearby test functions calls them; presumably they are referenced by
; tests earlier in this file - verify before removing.
2651declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
2653declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone