1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86
3; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64
4
5; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vl-builtins.c
6
; Merge-masked sitofp <4 x i32> -> <4 x float>: low 4 bits of %__U select converted lanes, others keep %__W; expected to fold to vcvtdq2ps {%k1}.
7define <4 x float> @test_mm_mask_cvtepi32_ps(<4 x float> %__W, i8 zeroext %__U, <2 x i64> %__A) {
8; X86-LABEL: test_mm_mask_cvtepi32_ps:
9; X86:       # %bb.0: # %entry
10; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
11; X86-NEXT:    kmovw %eax, %k1
12; X86-NEXT:    vcvtdq2ps %xmm1, %xmm0 {%k1}
13; X86-NEXT:    retl
14;
15; X64-LABEL: test_mm_mask_cvtepi32_ps:
16; X64:       # %bb.0: # %entry
17; X64-NEXT:    kmovw %edi, %k1
18; X64-NEXT:    vcvtdq2ps %xmm1, %xmm0 {%k1}
19; X64-NEXT:    retq
20entry:
21  %0 = bitcast <2 x i64> %__A to <4 x i32>
22  %conv.i.i = sitofp <4 x i32> %0 to <4 x float>
23  %1 = bitcast i8 %__U to <8 x i1>
24  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
25  %2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> %__W
26  ret <4 x float> %2
27}
28
; Zero-masked sitofp <4 x i32> -> <4 x float>: low 4 bits of %__U select converted lanes, others are zeroed; expected to fold to vcvtdq2ps {%k1} {z}.
29define <4 x float> @test_mm_maskz_cvtepi32_ps(i8 zeroext %__U, <2 x i64> %__A) {
30; X86-LABEL: test_mm_maskz_cvtepi32_ps:
31; X86:       # %bb.0: # %entry
32; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
33; X86-NEXT:    kmovw %eax, %k1
34; X86-NEXT:    vcvtdq2ps %xmm0, %xmm0 {%k1} {z}
35; X86-NEXT:    retl
36;
37; X64-LABEL: test_mm_maskz_cvtepi32_ps:
38; X64:       # %bb.0: # %entry
39; X64-NEXT:    kmovw %edi, %k1
40; X64-NEXT:    vcvtdq2ps %xmm0, %xmm0 {%k1} {z}
41; X64-NEXT:    retq
42entry:
43  %0 = bitcast <2 x i64> %__A to <4 x i32>
44  %conv.i.i = sitofp <4 x i32> %0 to <4 x float>
45  %1 = bitcast i8 %__U to <8 x i1>
46  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
47  %2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> zeroinitializer
48  ret <4 x float> %2
49}
50
; 256-bit merge-masked sitofp <8 x i32> -> <8 x float>: all 8 bits of %__U used directly (no shuffle needed); folds to ymm vcvtdq2ps {%k1}.
51define <8 x float> @test_mm256_mask_cvtepi32_ps(<8 x float> %__W, i8 zeroext %__U, <4 x i64> %__A) {
52; X86-LABEL: test_mm256_mask_cvtepi32_ps:
53; X86:       # %bb.0: # %entry
54; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
55; X86-NEXT:    kmovw %eax, %k1
56; X86-NEXT:    vcvtdq2ps %ymm1, %ymm0 {%k1}
57; X86-NEXT:    retl
58;
59; X64-LABEL: test_mm256_mask_cvtepi32_ps:
60; X64:       # %bb.0: # %entry
61; X64-NEXT:    kmovw %edi, %k1
62; X64-NEXT:    vcvtdq2ps %ymm1, %ymm0 {%k1}
63; X64-NEXT:    retq
64entry:
65  %0 = bitcast <4 x i64> %__A to <8 x i32>
66  %conv.i.i = sitofp <8 x i32> %0 to <8 x float>
67  %1 = bitcast i8 %__U to <8 x i1>
68  %2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> %__W
69  ret <8 x float> %2
70}
71
; 256-bit zero-masked sitofp <8 x i32> -> <8 x float>: masked-off lanes zeroed; folds to ymm vcvtdq2ps {%k1} {z}.
72define <8 x float> @test_mm256_maskz_cvtepi32_ps(i8 zeroext %__U, <4 x i64> %__A) {
73; X86-LABEL: test_mm256_maskz_cvtepi32_ps:
74; X86:       # %bb.0: # %entry
75; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
76; X86-NEXT:    kmovw %eax, %k1
77; X86-NEXT:    vcvtdq2ps %ymm0, %ymm0 {%k1} {z}
78; X86-NEXT:    retl
79;
80; X64-LABEL: test_mm256_maskz_cvtepi32_ps:
81; X64:       # %bb.0: # %entry
82; X64-NEXT:    kmovw %edi, %k1
83; X64-NEXT:    vcvtdq2ps %ymm0, %ymm0 {%k1} {z}
84; X64-NEXT:    retq
85entry:
86  %0 = bitcast <4 x i64> %__A to <8 x i32>
87  %conv.i.i = sitofp <8 x i32> %0 to <8 x float>
88  %1 = bitcast i8 %__U to <8 x i1>
89  %2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> zeroinitializer
90  ret <8 x float> %2
91}
92
; Merge-masked pd->dq: masking is carried by the avx512.mask.cvtpd2dq.128 intrinsic itself (passthrough %__W, mask %__U); emits vcvtpd2dq {%k1}.
93define <2 x i64> @test_mm_mask_cvtpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) {
94; X86-LABEL: test_mm_mask_cvtpd_epi32:
95; X86:       # %bb.0: # %entry
96; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
97; X86-NEXT:    kmovw %eax, %k1
98; X86-NEXT:    vcvtpd2dq %xmm1, %xmm0 {%k1}
99; X86-NEXT:    retl
100;
101; X64-LABEL: test_mm_mask_cvtpd_epi32:
102; X64:       # %bb.0: # %entry
103; X64-NEXT:    kmovw %edi, %k1
104; X64-NEXT:    vcvtpd2dq %xmm1, %xmm0 {%k1}
105; X64-NEXT:    retq
106entry:
107  %0 = bitcast <2 x i64> %__W to <4 x i32>
108  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8
109  %2 = bitcast <4 x i32> %1 to <2 x i64>
110  ret <2 x i64> %2
111}
112
; Zero-masked pd->dq: same intrinsic with zeroinitializer passthrough; emits vcvtpd2dq {%k1} {z}.
113define <2 x i64> @test_mm_maskz_cvtpd_epi32(i8 zeroext %__U, <2 x double> %__A) {
114; X86-LABEL: test_mm_maskz_cvtpd_epi32:
115; X86:       # %bb.0: # %entry
116; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
117; X86-NEXT:    kmovw %eax, %k1
118; X86-NEXT:    vcvtpd2dq %xmm0, %xmm0 {%k1} {z}
119; X86-NEXT:    retl
120;
121; X64-LABEL: test_mm_maskz_cvtpd_epi32:
122; X64:       # %bb.0: # %entry
123; X64-NEXT:    kmovw %edi, %k1
124; X64-NEXT:    vcvtpd2dq %xmm0, %xmm0 {%k1} {z}
125; X64-NEXT:    retq
126entry:
127  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
128  %1 = bitcast <4 x i32> %0 to <2 x i64>
129  ret <2 x i64> %1
130}
131
; Merge-masked 256-bit pd->dq: unmasked avx.cvt.pd2dq.256 intrinsic plus IR select on the low 4 mask bits; backend folds this to masked vcvtpd2dq ymm->xmm.
132define <2 x i64> @test_mm256_mask_cvtpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) {
133; X86-LABEL: test_mm256_mask_cvtpd_epi32:
134; X86:       # %bb.0: # %entry
135; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
136; X86-NEXT:    kmovw %eax, %k1
137; X86-NEXT:    vcvtpd2dq %ymm1, %xmm0 {%k1}
138; X86-NEXT:    vzeroupper
139; X86-NEXT:    retl
140;
141; X64-LABEL: test_mm256_mask_cvtpd_epi32:
142; X64:       # %bb.0: # %entry
143; X64-NEXT:    kmovw %edi, %k1
144; X64-NEXT:    vcvtpd2dq %ymm1, %xmm0 {%k1}
145; X64-NEXT:    vzeroupper
146; X64-NEXT:    retq
147entry:
148  %0 = tail call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %__A) #8
149  %1 = bitcast <2 x i64> %__W to <4 x i32>
150  %2 = bitcast i8 %__U to <8 x i1>
151  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
152  %3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1
153  %4 = bitcast <4 x i32> %3 to <2 x i64>
154  ret <2 x i64> %4
155}
156
; Zero-masked 256-bit pd->dq: unmasked avx intrinsic plus select against zeroinitializer; folds to vcvtpd2dq {%k1} {z}.
157define <2 x i64> @test_mm256_maskz_cvtpd_epi32(i8 zeroext %__U, <4 x double> %__A) {
158; X86-LABEL: test_mm256_maskz_cvtpd_epi32:
159; X86:       # %bb.0: # %entry
160; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
161; X86-NEXT:    kmovw %eax, %k1
162; X86-NEXT:    vcvtpd2dq %ymm0, %xmm0 {%k1} {z}
163; X86-NEXT:    vzeroupper
164; X86-NEXT:    retl
165;
166; X64-LABEL: test_mm256_maskz_cvtpd_epi32:
167; X64:       # %bb.0: # %entry
168; X64-NEXT:    kmovw %edi, %k1
169; X64-NEXT:    vcvtpd2dq %ymm0, %xmm0 {%k1} {z}
170; X64-NEXT:    vzeroupper
171; X64-NEXT:    retq
172entry:
173  %0 = tail call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %__A) #8
174  %1 = bitcast i8 %__U to <8 x i1>
175  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
176  %2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer
177  %3 = bitcast <4 x i32> %2 to <2 x i64>
178  ret <2 x i64> %3
179}
180
; Merge-masked pd->ps: masking carried by avx512.mask.cvtpd2ps intrinsic (passthrough %__W); emits vcvtpd2ps {%k1}.
181define <4 x float> @test_mm_mask_cvtpd_ps(<4 x float> %__W, i8 zeroext %__U, <2 x double> %__A) {
182; X86-LABEL: test_mm_mask_cvtpd_ps:
183; X86:       # %bb.0: # %entry
184; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
185; X86-NEXT:    kmovw %eax, %k1
186; X86-NEXT:    vcvtpd2ps %xmm1, %xmm0 {%k1}
187; X86-NEXT:    retl
188;
189; X64-LABEL: test_mm_mask_cvtpd_ps:
190; X64:       # %bb.0: # %entry
191; X64-NEXT:    kmovw %edi, %k1
192; X64-NEXT:    vcvtpd2ps %xmm1, %xmm0 {%k1}
193; X64-NEXT:    retq
194entry:
195  %0 = tail call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %__A, <4 x float> %__W, i8 %__U) #8
196  ret <4 x float> %0
197}
198
; Zero-masked pd->ps: same intrinsic with zeroinitializer passthrough; emits vcvtpd2ps {%k1} {z}.
199define <4 x float> @test_mm_maskz_cvtpd_ps(i8 zeroext %__U, <2 x double> %__A) {
200; X86-LABEL: test_mm_maskz_cvtpd_ps:
201; X86:       # %bb.0: # %entry
202; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
203; X86-NEXT:    kmovw %eax, %k1
204; X86-NEXT:    vcvtpd2ps %xmm0, %xmm0 {%k1} {z}
205; X86-NEXT:    retl
206;
207; X64-LABEL: test_mm_maskz_cvtpd_ps:
208; X64:       # %bb.0: # %entry
209; X64-NEXT:    kmovw %edi, %k1
210; X64-NEXT:    vcvtpd2ps %xmm0, %xmm0 {%k1} {z}
211; X64-NEXT:    retq
212entry:
213  %0 = tail call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %__A, <4 x float> zeroinitializer, i8 %__U) #8
214  ret <4 x float> %0
215}
216
; Merge-masked 256-bit pd->ps: unmasked avx.cvt.pd2.ps.256 plus select on low 4 mask bits; folds to masked vcvtpd2ps ymm->xmm.
217define <4 x float> @test_mm256_mask_cvtpd_ps(<4 x float> %__W, i8 zeroext %__U, <4 x double> %__A) {
218; X86-LABEL: test_mm256_mask_cvtpd_ps:
219; X86:       # %bb.0: # %entry
220; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
221; X86-NEXT:    kmovw %eax, %k1
222; X86-NEXT:    vcvtpd2ps %ymm1, %xmm0 {%k1}
223; X86-NEXT:    vzeroupper
224; X86-NEXT:    retl
225;
226; X64-LABEL: test_mm256_mask_cvtpd_ps:
227; X64:       # %bb.0: # %entry
228; X64-NEXT:    kmovw %edi, %k1
229; X64-NEXT:    vcvtpd2ps %ymm1, %xmm0 {%k1}
230; X64-NEXT:    vzeroupper
231; X64-NEXT:    retq
232entry:
233  %0 = tail call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %__A) #8
234  %1 = bitcast i8 %__U to <8 x i1>
235  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
236  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__W
237  ret <4 x float> %2
238}
239
; Zero-masked 256-bit pd->ps: select against zeroinitializer; folds to vcvtpd2ps {%k1} {z}.
240define <4 x float> @test_mm256_maskz_cvtpd_ps(i8 zeroext %__U, <4 x double> %__A) {
241; X86-LABEL: test_mm256_maskz_cvtpd_ps:
242; X86:       # %bb.0: # %entry
243; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
244; X86-NEXT:    kmovw %eax, %k1
245; X86-NEXT:    vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
246; X86-NEXT:    vzeroupper
247; X86-NEXT:    retl
248;
249; X64-LABEL: test_mm256_maskz_cvtpd_ps:
250; X64:       # %bb.0: # %entry
251; X64-NEXT:    kmovw %edi, %k1
252; X64-NEXT:    vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
253; X64-NEXT:    vzeroupper
254; X64-NEXT:    retq
255entry:
256  %0 = tail call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %__A) #8
257  %1 = bitcast i8 %__U to <8 x i1>
258  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
259  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
260  ret <4 x float> %2
261}
262
; Unmasked pd->udq: mask intrinsic called with all-ones mask (-1), so no {%k1} in the output.
263define <2 x i64> @test_mm_cvtpd_epu32(<2 x double> %__A) {
264; CHECK-LABEL: test_mm_cvtpd_epu32:
265; CHECK:       # %bb.0: # %entry
266; CHECK-NEXT:    vcvtpd2udq %xmm0, %xmm0
267; CHECK-NEXT:    ret{{[l|q]}}
268entry:
269  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8
270  %1 = bitcast <4 x i32> %0 to <2 x i64>
271  ret <2 x i64> %1
272}
273
; Merge-masked pd->udq via avx512.mask.cvtpd2udq.128 (passthrough %__W, mask %__U); emits vcvtpd2udq {%k1}.
274define <2 x i64> @test_mm_mask_cvtpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) {
275; X86-LABEL: test_mm_mask_cvtpd_epu32:
276; X86:       # %bb.0: # %entry
277; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
278; X86-NEXT:    kmovw %eax, %k1
279; X86-NEXT:    vcvtpd2udq %xmm1, %xmm0 {%k1}
280; X86-NEXT:    retl
281;
282; X64-LABEL: test_mm_mask_cvtpd_epu32:
283; X64:       # %bb.0: # %entry
284; X64-NEXT:    kmovw %edi, %k1
285; X64-NEXT:    vcvtpd2udq %xmm1, %xmm0 {%k1}
286; X64-NEXT:    retq
287entry:
288  %0 = bitcast <2 x i64> %__W to <4 x i32>
289  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8
290  %2 = bitcast <4 x i32> %1 to <2 x i64>
291  ret <2 x i64> %2
292}
293
; Zero-masked pd->udq: zeroinitializer passthrough; emits vcvtpd2udq {%k1} {z}.
294define <2 x i64> @test_mm_maskz_cvtpd_epu32(i8 zeroext %__U, <2 x double> %__A) {
295; X86-LABEL: test_mm_maskz_cvtpd_epu32:
296; X86:       # %bb.0: # %entry
297; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
298; X86-NEXT:    kmovw %eax, %k1
299; X86-NEXT:    vcvtpd2udq %xmm0, %xmm0 {%k1} {z}
300; X86-NEXT:    retl
301;
302; X64-LABEL: test_mm_maskz_cvtpd_epu32:
303; X64:       # %bb.0: # %entry
304; X64-NEXT:    kmovw %edi, %k1
305; X64-NEXT:    vcvtpd2udq %xmm0, %xmm0 {%k1} {z}
306; X64-NEXT:    retq
307entry:
308  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
309  %1 = bitcast <4 x i32> %0 to <2 x i64>
310  ret <2 x i64> %1
311}
312
; Unmasked 256-bit pd->udq (mask -1); ymm source narrows to xmm, hence vzeroupper before return.
313define <2 x i64> @test_mm256_cvtpd_epu32(<4 x double> %__A) {
314; CHECK-LABEL: test_mm256_cvtpd_epu32:
315; CHECK:       # %bb.0: # %entry
316; CHECK-NEXT:    vcvtpd2udq %ymm0, %xmm0
317; CHECK-NEXT:    vzeroupper
318; CHECK-NEXT:    ret{{[l|q]}}
319entry:
320  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8
321  %1 = bitcast <4 x i32> %0 to <2 x i64>
322  ret <2 x i64> %1
323}
324
; Merge-masked 256-bit pd->udq via avx512.mask.cvtpd2udq.256; emits vcvtpd2udq ymm->xmm {%k1}.
325define <2 x i64> @test_mm256_mask_cvtpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) {
326; X86-LABEL: test_mm256_mask_cvtpd_epu32:
327; X86:       # %bb.0: # %entry
328; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
329; X86-NEXT:    kmovw %eax, %k1
330; X86-NEXT:    vcvtpd2udq %ymm1, %xmm0 {%k1}
331; X86-NEXT:    vzeroupper
332; X86-NEXT:    retl
333;
334; X64-LABEL: test_mm256_mask_cvtpd_epu32:
335; X64:       # %bb.0: # %entry
336; X64-NEXT:    kmovw %edi, %k1
337; X64-NEXT:    vcvtpd2udq %ymm1, %xmm0 {%k1}
338; X64-NEXT:    vzeroupper
339; X64-NEXT:    retq
340entry:
341  %0 = bitcast <2 x i64> %__W to <4 x i32>
342  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %__A, <4 x i32> %0, i8 %__U) #8
343  %2 = bitcast <4 x i32> %1 to <2 x i64>
344  ret <2 x i64> %2
345}
346
; Zero-masked 256-bit pd->udq: zeroinitializer passthrough; emits vcvtpd2udq {%k1} {z}.
347define <2 x i64> @test_mm256_maskz_cvtpd_epu32(i8 zeroext %__U, <4 x double> %__A) {
348; X86-LABEL: test_mm256_maskz_cvtpd_epu32:
349; X86:       # %bb.0: # %entry
350; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
351; X86-NEXT:    kmovw %eax, %k1
352; X86-NEXT:    vcvtpd2udq %ymm0, %xmm0 {%k1} {z}
353; X86-NEXT:    vzeroupper
354; X86-NEXT:    retl
355;
356; X64-LABEL: test_mm256_maskz_cvtpd_epu32:
357; X64:       # %bb.0: # %entry
358; X64-NEXT:    kmovw %edi, %k1
359; X64-NEXT:    vcvtpd2udq %ymm0, %xmm0 {%k1} {z}
360; X64-NEXT:    vzeroupper
361; X64-NEXT:    retq
362entry:
363  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
364  %1 = bitcast <4 x i32> %0 to <2 x i64>
365  ret <2 x i64> %1
366}
367
; Merge-masked ps->dq: unmasked sse2.cvtps2dq intrinsic plus select on low 4 mask bits; folds to vcvtps2dq {%k1}.
368define <2 x i64> @test_mm_mask_cvtps_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) {
369; X86-LABEL: test_mm_mask_cvtps_epi32:
370; X86:       # %bb.0: # %entry
371; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
372; X86-NEXT:    kmovw %eax, %k1
373; X86-NEXT:    vcvtps2dq %xmm1, %xmm0 {%k1}
374; X86-NEXT:    retl
375;
376; X64-LABEL: test_mm_mask_cvtps_epi32:
377; X64:       # %bb.0: # %entry
378; X64-NEXT:    kmovw %edi, %k1
379; X64-NEXT:    vcvtps2dq %xmm1, %xmm0 {%k1}
380; X64-NEXT:    retq
381entry:
382  %0 = tail call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %__A) #8
383  %1 = bitcast <2 x i64> %__W to <4 x i32>
384  %2 = bitcast i8 %__U to <8 x i1>
385  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
386  %3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1
387  %4 = bitcast <4 x i32> %3 to <2 x i64>
388  ret <2 x i64> %4
389}
390
; Zero-masked ps->dq: select against zeroinitializer; folds to vcvtps2dq {%k1} {z}.
391define <2 x i64> @test_mm_maskz_cvtps_epi32(i8 zeroext %__U, <4 x float> %__A) {
392; X86-LABEL: test_mm_maskz_cvtps_epi32:
393; X86:       # %bb.0: # %entry
394; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
395; X86-NEXT:    kmovw %eax, %k1
396; X86-NEXT:    vcvtps2dq %xmm0, %xmm0 {%k1} {z}
397; X86-NEXT:    retl
398;
399; X64-LABEL: test_mm_maskz_cvtps_epi32:
400; X64:       # %bb.0: # %entry
401; X64-NEXT:    kmovw %edi, %k1
402; X64-NEXT:    vcvtps2dq %xmm0, %xmm0 {%k1} {z}
403; X64-NEXT:    retq
404entry:
405  %0 = tail call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %__A) #8
406  %1 = bitcast i8 %__U to <8 x i1>
407  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
408  %2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer
409  %3 = bitcast <4 x i32> %2 to <2 x i64>
410  ret <2 x i64> %3
411}
412
; Merge-masked 256-bit ps->dq: avx.cvt.ps2dq.256 plus full 8-bit select; folds to ymm vcvtps2dq {%k1}.
413define <4 x i64> @test_mm256_mask_cvtps_epi32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
414; X86-LABEL: test_mm256_mask_cvtps_epi32:
415; X86:       # %bb.0: # %entry
416; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
417; X86-NEXT:    kmovw %eax, %k1
418; X86-NEXT:    vcvtps2dq %ymm1, %ymm0 {%k1}
419; X86-NEXT:    retl
420;
421; X64-LABEL: test_mm256_mask_cvtps_epi32:
422; X64:       # %bb.0: # %entry
423; X64-NEXT:    kmovw %edi, %k1
424; X64-NEXT:    vcvtps2dq %ymm1, %ymm0 {%k1}
425; X64-NEXT:    retq
426entry:
427  %0 = tail call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %__A) #8
428  %1 = bitcast <4 x i64> %__W to <8 x i32>
429  %2 = bitcast i8 %__U to <8 x i1>
430  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
431  %4 = bitcast <8 x i32> %3 to <4 x i64>
432  ret <4 x i64> %4
433}
434
; Zero-masked 256-bit ps->dq: full 8-bit select against zeroinitializer; folds to ymm vcvtps2dq {%k1} {z}.
435define <4 x i64> @test_mm256_maskz_cvtps_epi32(i8 zeroext %__U, <8 x float> %__A) {
436; X86-LABEL: test_mm256_maskz_cvtps_epi32:
437; X86:       # %bb.0: # %entry
438; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
439; X86-NEXT:    kmovw %eax, %k1
440; X86-NEXT:    vcvtps2dq %ymm0, %ymm0 {%k1} {z}
441; X86-NEXT:    retl
442;
443; X64-LABEL: test_mm256_maskz_cvtps_epi32:
444; X64:       # %bb.0: # %entry
445; X64-NEXT:    kmovw %edi, %k1
446; X64-NEXT:    vcvtps2dq %ymm0, %ymm0 {%k1} {z}
447; X64-NEXT:    retq
448entry:
449  %0 = tail call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %__A) #8
450  %1 = bitcast i8 %__U to <8 x i1>
451  %2 = select <8 x i1> %1, <8 x i32> %0, <8 x i32> zeroinitializer
452  %3 = bitcast <8 x i32> %2 to <4 x i64>
453  ret <4 x i64> %3
454}
455
; Merge-masked ps->pd: fpext of the low 2 floats, select on low 2 mask bits; folds to vcvtps2pd {%k1}.
456define <2 x double> @test_mm_mask_cvtps_pd(<2 x double> %__W, i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 {
457; X86-LABEL: test_mm_mask_cvtps_pd:
458; X86:       # %bb.0: # %entry
459; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
460; X86-NEXT:    kmovw %eax, %k1
461; X86-NEXT:    vcvtps2pd %xmm1, %xmm0 {%k1}
462; X86-NEXT:    retl
463;
464; X64-LABEL: test_mm_mask_cvtps_pd:
465; X64:       # %bb.0: # %entry
466; X64-NEXT:    kmovw %edi, %k1
467; X64-NEXT:    vcvtps2pd %xmm1, %xmm0 {%k1}
468; X64-NEXT:    retq
469entry:
470  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <2 x i32> <i32 0, i32 1>
471  %conv.i.i = fpext <2 x float> %shuffle.i.i to <2 x double>
472  %0 = bitcast i8 %__U to <8 x i1>
473  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
474  %1 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> %__W
475  ret <2 x double> %1
476}
477
; Zero-masked ps->pd: low-2-lane fpext with select against zeroinitializer; folds to vcvtps2pd {%k1} {z}.
478define <2 x double> @test_mm_maskz_cvtps_pd(i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 {
479; X86-LABEL: test_mm_maskz_cvtps_pd:
480; X86:       # %bb.0: # %entry
481; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
482; X86-NEXT:    kmovw %eax, %k1
483; X86-NEXT:    vcvtps2pd %xmm0, %xmm0 {%k1} {z}
484; X86-NEXT:    retl
485;
486; X64-LABEL: test_mm_maskz_cvtps_pd:
487; X64:       # %bb.0: # %entry
488; X64-NEXT:    kmovw %edi, %k1
489; X64-NEXT:    vcvtps2pd %xmm0, %xmm0 {%k1} {z}
490; X64-NEXT:    retq
491entry:
492  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <2 x i32> <i32 0, i32 1>
493  %conv.i.i = fpext <2 x float> %shuffle.i.i to <2 x double>
494  %0 = bitcast i8 %__U to <8 x i1>
495  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
496  %1 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> zeroinitializer
497  ret <2 x double> %1
498}
499
; Merge-masked 256-bit ps->pd: fpext of all 4 floats, select on low 4 mask bits; folds to vcvtps2pd xmm->ymm {%k1}.
500define <4 x double> @test_mm256_mask_cvtps_pd(<4 x double> %__W, i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 {
501; X86-LABEL: test_mm256_mask_cvtps_pd:
502; X86:       # %bb.0: # %entry
503; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
504; X86-NEXT:    kmovw %eax, %k1
505; X86-NEXT:    vcvtps2pd %xmm1, %ymm0 {%k1}
506; X86-NEXT:    retl
507;
508; X64-LABEL: test_mm256_mask_cvtps_pd:
509; X64:       # %bb.0: # %entry
510; X64-NEXT:    kmovw %edi, %k1
511; X64-NEXT:    vcvtps2pd %xmm1, %ymm0 {%k1}
512; X64-NEXT:    retq
513entry:
514  %conv.i.i = fpext <4 x float> %__A to <4 x double>
515  %0 = bitcast i8 %__U to <8 x i1>
516  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
517  %1 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> %__W
518  ret <4 x double> %1
519}
520
; Zero-masked 256-bit ps->pd: select against zeroinitializer; folds to vcvtps2pd xmm->ymm {%k1} {z}.
521define <4 x double> @test_mm256_maskz_cvtps_pd(i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 {
522; X86-LABEL: test_mm256_maskz_cvtps_pd:
523; X86:       # %bb.0: # %entry
524; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
525; X86-NEXT:    kmovw %eax, %k1
526; X86-NEXT:    vcvtps2pd %xmm0, %ymm0 {%k1} {z}
527; X86-NEXT:    retl
528;
529; X64-LABEL: test_mm256_maskz_cvtps_pd:
530; X64:       # %bb.0: # %entry
531; X64-NEXT:    kmovw %edi, %k1
532; X64-NEXT:    vcvtps2pd %xmm0, %ymm0 {%k1} {z}
533; X64-NEXT:    retq
534entry:
535  %conv.i.i = fpext <4 x float> %__A to <4 x double>
536  %0 = bitcast i8 %__U to <8 x i1>
537  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
538  %1 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> zeroinitializer
539  ret <4 x double> %1
540}
541
; Unmasked ps->udq: mask intrinsic with all-ones mask (-1); plain vcvtps2udq.
542define <2 x i64> @test_mm_cvtps_epu32(<4 x float> %__A) {
543; CHECK-LABEL: test_mm_cvtps_epu32:
544; CHECK:       # %bb.0: # %entry
545; CHECK-NEXT:    vcvtps2udq %xmm0, %xmm0
546; CHECK-NEXT:    ret{{[l|q]}}
547entry:
548  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 -1) #8
549  %1 = bitcast <4 x i32> %0 to <2 x i64>
550  ret <2 x i64> %1
551}
552
; Merge-masked ps->udq via avx512.mask.cvtps2udq.128 (passthrough %__W); emits vcvtps2udq {%k1}.
553define <2 x i64> @test_mm_mask_cvtps_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) {
554; X86-LABEL: test_mm_mask_cvtps_epu32:
555; X86:       # %bb.0: # %entry
556; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
557; X86-NEXT:    kmovw %eax, %k1
558; X86-NEXT:    vcvtps2udq %xmm1, %xmm0 {%k1}
559; X86-NEXT:    retl
560;
561; X64-LABEL: test_mm_mask_cvtps_epu32:
562; X64:       # %bb.0: # %entry
563; X64-NEXT:    kmovw %edi, %k1
564; X64-NEXT:    vcvtps2udq %xmm1, %xmm0 {%k1}
565; X64-NEXT:    retq
566entry:
567  %0 = bitcast <2 x i64> %__W to <4 x i32>
568  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %__A, <4 x i32> %0, i8 %__U) #8
569  %2 = bitcast <4 x i32> %1 to <2 x i64>
570  ret <2 x i64> %2
571}
572
; Zero-masked ps->udq: zeroinitializer passthrough; emits vcvtps2udq {%k1} {z}.
573define <2 x i64> @test_mm_maskz_cvtps_epu32(i8 zeroext %__U, <4 x float> %__A) {
574; X86-LABEL: test_mm_maskz_cvtps_epu32:
575; X86:       # %bb.0: # %entry
576; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
577; X86-NEXT:    kmovw %eax, %k1
578; X86-NEXT:    vcvtps2udq %xmm0, %xmm0 {%k1} {z}
579; X86-NEXT:    retl
580;
581; X64-LABEL: test_mm_maskz_cvtps_epu32:
582; X64:       # %bb.0: # %entry
583; X64-NEXT:    kmovw %edi, %k1
584; X64-NEXT:    vcvtps2udq %xmm0, %xmm0 {%k1} {z}
585; X64-NEXT:    retq
586entry:
587  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
588  %1 = bitcast <4 x i32> %0 to <2 x i64>
589  ret <2 x i64> %1
590}
591
; Unmasked 256-bit ps->udq (mask -1); plain ymm vcvtps2udq.
592define <4 x i64> @test_mm256_cvtps_epu32(<8 x float> %__A) {
593; CHECK-LABEL: test_mm256_cvtps_epu32:
594; CHECK:       # %bb.0: # %entry
595; CHECK-NEXT:    vcvtps2udq %ymm0, %ymm0
596; CHECK-NEXT:    ret{{[l|q]}}
597entry:
598  %0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 -1) #8
599  %1 = bitcast <8 x i32> %0 to <4 x i64>
600  ret <4 x i64> %1
601}
602
; Merge-masked 256-bit ps->udq via avx512.mask.cvtps2udq.256; emits ymm vcvtps2udq {%k1}.
603define <4 x i64> @test_mm256_mask_cvtps_epu32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
604; X86-LABEL: test_mm256_mask_cvtps_epu32:
605; X86:       # %bb.0: # %entry
606; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
607; X86-NEXT:    kmovw %eax, %k1
608; X86-NEXT:    vcvtps2udq %ymm1, %ymm0 {%k1}
609; X86-NEXT:    retl
610;
611; X64-LABEL: test_mm256_mask_cvtps_epu32:
612; X64:       # %bb.0: # %entry
613; X64-NEXT:    kmovw %edi, %k1
614; X64-NEXT:    vcvtps2udq %ymm1, %ymm0 {%k1}
615; X64-NEXT:    retq
616entry:
617  %0 = bitcast <4 x i64> %__W to <8 x i32>
618  %1 = tail call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %__A, <8 x i32> %0, i8 %__U) #8
619  %2 = bitcast <8 x i32> %1 to <4 x i64>
620  ret <4 x i64> %2
621}
622
; Zero-masked 256-bit ps->udq: zeroinitializer passthrough; emits ymm vcvtps2udq {%k1} {z}.
623define <4 x i64> @test_mm256_maskz_cvtps_epu32(i8 zeroext %__U, <8 x float> %__A) {
624; X86-LABEL: test_mm256_maskz_cvtps_epu32:
625; X86:       # %bb.0: # %entry
626; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
627; X86-NEXT:    kmovw %eax, %k1
628; X86-NEXT:    vcvtps2udq %ymm0, %ymm0 {%k1} {z}
629; X86-NEXT:    retl
630;
631; X64-LABEL: test_mm256_maskz_cvtps_epu32:
632; X64:       # %bb.0: # %entry
633; X64-NEXT:    kmovw %edi, %k1
634; X64-NEXT:    vcvtps2udq %ymm0, %ymm0 {%k1} {z}
635; X64-NEXT:    retq
636entry:
637  %0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 %__U) #8
638  %1 = bitcast <8 x i32> %0 to <4 x i64>
639  ret <4 x i64> %1
640}
641
; Merge-masked truncating pd->dq via avx512.mask.cvttpd2dq.128; emits vcvttpd2dq {%k1}.
642define <2 x i64> @test_mm_mask_cvttpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) {
643; X86-LABEL: test_mm_mask_cvttpd_epi32:
644; X86:       # %bb.0: # %entry
645; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
646; X86-NEXT:    kmovw %eax, %k1
647; X86-NEXT:    vcvttpd2dq %xmm1, %xmm0 {%k1}
648; X86-NEXT:    retl
649;
650; X64-LABEL: test_mm_mask_cvttpd_epi32:
651; X64:       # %bb.0: # %entry
652; X64-NEXT:    kmovw %edi, %k1
653; X64-NEXT:    vcvttpd2dq %xmm1, %xmm0 {%k1}
654; X64-NEXT:    retq
655entry:
656  %0 = bitcast <2 x i64> %__W to <4 x i32>
657  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8
658  %2 = bitcast <4 x i32> %1 to <2 x i64>
659  ret <2 x i64> %2
660}
661
; Zero-masked truncating pd->dq: zeroinitializer passthrough; emits vcvttpd2dq {%k1} {z}.
662define <2 x i64> @test_mm_maskz_cvttpd_epi32(i8 zeroext %__U, <2 x double> %__A) {
663; X86-LABEL: test_mm_maskz_cvttpd_epi32:
664; X86:       # %bb.0: # %entry
665; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
666; X86-NEXT:    kmovw %eax, %k1
667; X86-NEXT:    vcvttpd2dq %xmm0, %xmm0 {%k1} {z}
668; X86-NEXT:    retl
669;
670; X64-LABEL: test_mm_maskz_cvttpd_epi32:
671; X64:       # %bb.0: # %entry
672; X64-NEXT:    kmovw %edi, %k1
673; X64-NEXT:    vcvttpd2dq %xmm0, %xmm0 {%k1} {z}
674; X64-NEXT:    retq
675entry:
676  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
677  %1 = bitcast <4 x i32> %0 to <2 x i64>
678  ret <2 x i64> %1
679}
680
; Merge-masked 256-bit truncating pd->dq: unmasked avx.cvtt.pd2dq.256 plus select on low 4 mask bits; folds to vcvttpd2dq ymm->xmm {%k1}.
681define <2 x i64> @test_mm256_mask_cvttpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) {
682; X86-LABEL: test_mm256_mask_cvttpd_epi32:
683; X86:       # %bb.0: # %entry
684; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
685; X86-NEXT:    kmovw %eax, %k1
686; X86-NEXT:    vcvttpd2dq %ymm1, %xmm0 {%k1}
687; X86-NEXT:    vzeroupper
688; X86-NEXT:    retl
689;
690; X64-LABEL: test_mm256_mask_cvttpd_epi32:
691; X64:       # %bb.0: # %entry
692; X64-NEXT:    kmovw %edi, %k1
693; X64-NEXT:    vcvttpd2dq %ymm1, %xmm0 {%k1}
694; X64-NEXT:    vzeroupper
695; X64-NEXT:    retq
696entry:
697  %0 = tail call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %__A) #8
698  %1 = bitcast <2 x i64> %__W to <4 x i32>
699  %2 = bitcast i8 %__U to <8 x i1>
700  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
701  %3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1
702  %4 = bitcast <4 x i32> %3 to <2 x i64>
703  ret <2 x i64> %4
704}
705
; Zero-masked 256-bit truncating pd->dq: select against zeroinitializer; folds to vcvttpd2dq {%k1} {z}.
706define <2 x i64> @test_mm256_maskz_cvttpd_epi32(i8 zeroext %__U, <4 x double> %__A) {
707; X86-LABEL: test_mm256_maskz_cvttpd_epi32:
708; X86:       # %bb.0: # %entry
709; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
710; X86-NEXT:    kmovw %eax, %k1
711; X86-NEXT:    vcvttpd2dq %ymm0, %xmm0 {%k1} {z}
712; X86-NEXT:    vzeroupper
713; X86-NEXT:    retl
714;
715; X64-LABEL: test_mm256_maskz_cvttpd_epi32:
716; X64:       # %bb.0: # %entry
717; X64-NEXT:    kmovw %edi, %k1
718; X64-NEXT:    vcvttpd2dq %ymm0, %xmm0 {%k1} {z}
719; X64-NEXT:    vzeroupper
720; X64-NEXT:    retq
721entry:
722  %0 = tail call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %__A) #8
723  %1 = bitcast i8 %__U to <8 x i1>
724  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
725  %2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer
726  %3 = bitcast <4 x i32> %2 to <2 x i64>
727  ret <2 x i64> %3
728}
729
; Unmasked truncating pd->udq (mask -1); plain vcvttpd2udq.
730define <2 x i64> @test_mm_cvttpd_epu32(<2 x double> %__A) {
731; CHECK-LABEL: test_mm_cvttpd_epu32:
732; CHECK:       # %bb.0: # %entry
733; CHECK-NEXT:    vcvttpd2udq %xmm0, %xmm0
734; CHECK-NEXT:    ret{{[l|q]}}
735entry:
736  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8
737  %1 = bitcast <4 x i32> %0 to <2 x i64>
738  ret <2 x i64> %1
739}
740
; Merge-masked truncating pd->udq via avx512.mask.cvttpd2udq.128; emits vcvttpd2udq {%k1}.
741define <2 x i64> @test_mm_mask_cvttpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) {
742; X86-LABEL: test_mm_mask_cvttpd_epu32:
743; X86:       # %bb.0: # %entry
744; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
745; X86-NEXT:    kmovw %eax, %k1
746; X86-NEXT:    vcvttpd2udq %xmm1, %xmm0 {%k1}
747; X86-NEXT:    retl
748;
749; X64-LABEL: test_mm_mask_cvttpd_epu32:
750; X64:       # %bb.0: # %entry
751; X64-NEXT:    kmovw %edi, %k1
752; X64-NEXT:    vcvttpd2udq %xmm1, %xmm0 {%k1}
753; X64-NEXT:    retq
754entry:
755  %0 = bitcast <2 x i64> %__W to <4 x i32>
756  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8
757  %2 = bitcast <4 x i32> %1 to <2 x i64>
758  ret <2 x i64> %2
759}
760
; Zero-masked truncating pd->udq: zeroinitializer passthrough; emits vcvttpd2udq {%k1} {z}.
761define <2 x i64> @test_mm_maskz_cvttpd_epu32(i8 zeroext %__U, <2 x double> %__A) {
762; X86-LABEL: test_mm_maskz_cvttpd_epu32:
763; X86:       # %bb.0: # %entry
764; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
765; X86-NEXT:    kmovw %eax, %k1
766; X86-NEXT:    vcvttpd2udq %xmm0, %xmm0 {%k1} {z}
767; X86-NEXT:    retl
768;
769; X64-LABEL: test_mm_maskz_cvttpd_epu32:
770; X64:       # %bb.0: # %entry
771; X64-NEXT:    kmovw %edi, %k1
772; X64-NEXT:    vcvttpd2udq %xmm0, %xmm0 {%k1} {z}
773; X64-NEXT:    retq
774entry:
775  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
776  %1 = bitcast <4 x i32> %0 to <2 x i64>
777  ret <2 x i64> %1
778}
779
; Unmasked 256-bit truncating pd->udq (mask -1); ymm->xmm narrowing, hence vzeroupper.
780define <2 x i64> @test_mm256_cvttpd_epu32(<4 x double> %__A) {
781; CHECK-LABEL: test_mm256_cvttpd_epu32:
782; CHECK:       # %bb.0: # %entry
783; CHECK-NEXT:    vcvttpd2udq %ymm0, %xmm0
784; CHECK-NEXT:    vzeroupper
785; CHECK-NEXT:    ret{{[l|q]}}
786entry:
787  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8
788  %1 = bitcast <4 x i32> %0 to <2 x i64>
789  ret <2 x i64> %1
790}
791
; Merge-masked 256-bit truncating pd->udq via avx512.mask.cvttpd2udq.256; emits vcvttpd2udq ymm->xmm {%k1}.
792define <2 x i64> @test_mm256_mask_cvttpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) {
793; X86-LABEL: test_mm256_mask_cvttpd_epu32:
794; X86:       # %bb.0: # %entry
795; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
796; X86-NEXT:    kmovw %eax, %k1
797; X86-NEXT:    vcvttpd2udq %ymm1, %xmm0 {%k1}
798; X86-NEXT:    vzeroupper
799; X86-NEXT:    retl
800;
801; X64-LABEL: test_mm256_mask_cvttpd_epu32:
802; X64:       # %bb.0: # %entry
803; X64-NEXT:    kmovw %edi, %k1
804; X64-NEXT:    vcvttpd2udq %ymm1, %xmm0 {%k1}
805; X64-NEXT:    vzeroupper
806; X64-NEXT:    retq
807entry:
808  %0 = bitcast <2 x i64> %__W to <4 x i32>
809  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %__A, <4 x i32> %0, i8 %__U) #8
810  %2 = bitcast <4 x i32> %1 to <2 x i64>
811  ret <2 x i64> %2
812}
813
; Zero-masked 256-bit truncating pd->udq: zeroinitializer passthrough; emits vcvttpd2udq {%k1} {z}.
814define <2 x i64> @test_mm256_maskz_cvttpd_epu32(i8 zeroext %__U, <4 x double> %__A) {
815; X86-LABEL: test_mm256_maskz_cvttpd_epu32:
816; X86:       # %bb.0: # %entry
817; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
818; X86-NEXT:    kmovw %eax, %k1
819; X86-NEXT:    vcvttpd2udq %ymm0, %xmm0 {%k1} {z}
820; X86-NEXT:    vzeroupper
821; X86-NEXT:    retl
822;
823; X64-LABEL: test_mm256_maskz_cvttpd_epu32:
824; X64:       # %bb.0: # %entry
825; X64-NEXT:    kmovw %edi, %k1
826; X64-NEXT:    vcvttpd2udq %ymm0, %xmm0 {%k1} {z}
827; X64-NEXT:    vzeroupper
828; X64-NEXT:    retq
829entry:
830  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
831  %1 = bitcast <4 x i32> %0 to <2 x i64>
832  ret <2 x i64> %1
833}
834
; _mm_mask_cvttps_epi32: unlike the epu32 tests, the signed truncating convert
; uses the plain sse2.cvttps2dq intrinsic and expresses masking as an IR
; select over a <4 x i1> extracted from the i8 mask; must still fold into a
; single masked vcvttps2dq.
define <2 x i64> @test_mm_mask_cvttps_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_cvttps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2dq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvttps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2dq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %__A) #8
  %1 = bitcast <2 x i64> %__W to <4 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}

; Zero-masked form of the above: select against zeroinitializer -> {%k1} {z}.
define <2 x i64> @test_mm_maskz_cvttps_epi32(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_cvttps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2dq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvttps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2dq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}

; 256-bit merge-masked form: avx.cvtt.ps2dq.256 plus a full <8 x i1> select
; (no extract needed since the mask width matches the element count).
define <4 x i64> @test_mm256_mask_cvttps_epi32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_mask_cvttps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2dq %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvttps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2dq %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %__A) #8
  %1 = bitcast <4 x i64> %__W to <8 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  ret <4 x i64> %4
}

; 256-bit zero-masked form of the above.
define <4 x i64> @test_mm256_maskz_cvttps_epi32(i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_maskz_cvttps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2dq %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvttps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2dq %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i32> %0, <8 x i32> zeroinitializer
  %3 = bitcast <8 x i32> %2 to <4 x i64>
  ret <4 x i64> %3
}
922
; _mm_cvttps_epu32 (unmasked): truncating f32->u32 via
; avx512.mask.cvttps2udq.128 with an all-ones mask -> plain vcvttps2udq.
define <2 x i64> @test_mm_cvttps_epu32(<4 x float> %__A) {
; CHECK-LABEL: test_mm_cvttps_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvttps2udq %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

; Merge-masked 128-bit form: passthru (%__W) threaded through the intrinsic.
define <2 x i64> @test_mm_mask_cvttps_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_cvttps_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2udq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvttps_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2udq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

; Zero-masked 128-bit form -> {%k1} {z}.
define <2 x i64> @test_mm_maskz_cvttps_epu32(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_cvttps_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2udq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvttps_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2udq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

; 256-bit unmasked form (ymm -> ymm, so no vzeroupper is expected here).
define <4 x i64> @test_mm256_cvttps_epu32(<8 x float> %__A) {
; CHECK-LABEL: test_mm256_cvttps_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvttps2udq %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <8 x i32> %0 to <4 x i64>
  ret <4 x i64> %1
}

; 256-bit merge-masked form.
define <4 x i64> @test_mm256_mask_cvttps_epu32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_mask_cvttps_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2udq %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvttps_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2udq %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__W to <8 x i32>
  %1 = tail call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %__A, <8 x i32> %0, i8 %__U) #8
  %2 = bitcast <8 x i32> %1 to <4 x i64>
  ret <4 x i64> %2
}

; 256-bit zero-masked form.
define <4 x i64> @test_mm256_maskz_cvttps_epu32(i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_maskz_cvttps_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2udq %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvttps_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2udq %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <8 x i32> %0 to <4 x i64>
  ret <4 x i64> %1
}
1022
; _mm_cvtepu32_pd: expressed with generic IR (shufflevector to take the low
; two lanes, then uitofp) rather than an intrinsic; must select vcvtudq2pd.
define <2 x double> @test_mm_cvtepu32_pd(<2 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm_cvtepu32_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtudq2pd %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %shuffle.i = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %conv.i = uitofp <2 x i32> %shuffle.i to <2 x double>
  ret <2 x double> %conv.i
}

; Merge-masked form: the uitofp result is selected against %__W using the low
; two bits of the i8 mask -> masked vcvtudq2pd.
define <2 x double> @test_mm_mask_cvtepu32_pd(<2 x double> %__W, i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm_mask_cvtepu32_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtudq2pd %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtepu32_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtudq2pd %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %conv.i.i = uitofp <2 x i32> %shuffle.i.i to <2 x double>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> %__W
  ret <2 x double> %2
}

; Zero-masked form -> {%k1} {z}.
define <2 x double> @test_mm_maskz_cvtepu32_pd(i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm_maskz_cvtepu32_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtudq2pd %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtepu32_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtudq2pd %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %conv.i.i = uitofp <2 x i32> %shuffle.i.i to <2 x double>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> zeroinitializer
  ret <2 x double> %2
}

; 256-bit widening form: all four u32 lanes converted (xmm source, ymm dest).
define <4 x double> @test_mm256_cvtepu32_pd(<2 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm256_cvtepu32_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtudq2pd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i = uitofp <4 x i32> %0 to <4 x double>
  ret <4 x double> %conv.i
}

; 256-bit merge-masked form.
define <4 x double> @test_mm256_mask_cvtepu32_pd(<4 x double> %__W, i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm256_mask_cvtepu32_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtudq2pd %xmm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtepu32_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtudq2pd %xmm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i.i = uitofp <4 x i32> %0 to <4 x double>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> %__W
  ret <4 x double> %2
}

; 256-bit zero-masked form.
define <4 x double> @test_mm256_maskz_cvtepu32_pd(i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm256_maskz_cvtepu32_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtudq2pd %xmm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtepu32_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtudq2pd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i.i = uitofp <4 x i32> %0 to <4 x double>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> zeroinitializer
  ret <4 x double> %2
}
1135
; _mm_cvtepu32_ps: plain uitofp of all four u32 lanes -> vcvtudq2ps.
define <4 x float> @test_mm_cvtepu32_ps(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepu32_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtudq2ps %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i = uitofp <4 x i32> %0 to <4 x float>
  ret <4 x float> %conv.i
}

; Merge-masked form: select over the low 4 mask bits -> masked vcvtudq2ps.
define <4 x float> @test_mm_mask_cvtepu32_ps(<4 x float> %__W, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_cvtepu32_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtudq2ps %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtepu32_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtudq2ps %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i.i = uitofp <4 x i32> %0 to <4 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> %__W
  ret <4 x float> %2
}

; Zero-masked form -> {%k1} {z}.
define <4 x float> @test_mm_maskz_cvtepu32_ps(i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_cvtepu32_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtudq2ps %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtepu32_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtudq2ps %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i.i = uitofp <4 x i32> %0 to <4 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> zeroinitializer
  ret <4 x float> %2
}

; 256-bit unmasked form.
define <8 x float> @test_mm256_cvtepu32_ps(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_cvtepu32_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtudq2ps %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %conv.i = uitofp <8 x i32> %0 to <8 x float>
  ret <8 x float> %conv.i
}

; 256-bit merge-masked form: full-width <8 x i1> select, no extract.
define <8 x float> @test_mm256_mask_cvtepu32_ps(<8 x float> %__W, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_cvtepu32_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtudq2ps %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtepu32_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtudq2ps %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %conv.i.i = uitofp <8 x i32> %0 to <8 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> %__W
  ret <8 x float> %2
}

; 256-bit zero-masked form.
define <8 x float> @test_mm256_maskz_cvtepu32_ps(i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_cvtepu32_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtudq2ps %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtepu32_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtudq2ps %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %conv.i.i = uitofp <8 x i32> %0 to <8 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> zeroinitializer
  ret <8 x float> %2
}
1243
; _mm256_shuffle_f32x4 (unmasked): the 128-bit-lane shuffle is expected to
; relax to vperm2f128 when no masking is needed.
define <8 x float> @test_mm256_shuffle_f32x4(<8 x float> %__A, <8 x float> %__B) {
; CHECK-LABEL: test_mm256_shuffle_f32x4:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <8 x float> %shuffle
}

; Merge-masked form: the select forces the EVEX vshuff32x4 encoding so the
; mask can be applied.
define <8 x float> @test_mm256_mask_shuffle_f32x4(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) {
; X86-LABEL: test_mm256_mask_shuffle_f32x4:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shuffle_f32x4:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x float> %shuffle, <8 x float> %__W
  ret <8 x float> %1
}

; Zero-masked form -> vshuff32x4 {%k1} {z}.
define <8 x float> @test_mm256_maskz_shuffle_f32x4(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) {
; X86-LABEL: test_mm256_maskz_shuffle_f32x4:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shuffle_f32x4:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x float> %shuffle, <8 x float> zeroinitializer
  ret <8 x float> %1
}

; f64x2 versions of the same pattern (4 x double element type).
define <4 x double> @test_mm256_shuffle_f64x2(<4 x double> %__A, <4 x double> %__B) {
; CHECK-LABEL: test_mm256_shuffle_f64x2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x double> %shuffle
}

; Merge-masked f64x2: mask extracted to <4 x i1> -> vshuff64x2 {%k1}.
define <4 x double> @test_mm256_mask_shuffle_f64x2(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; X86-LABEL: test_mm256_mask_shuffle_f64x2:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shuffle_f64x2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x double> %shuffle, <4 x double> %__W
  ret <4 x double> %1
}

; Zero-masked f64x2 -> vshuff64x2 {%k1} {z}.
define <4 x double> @test_mm256_maskz_shuffle_f64x2(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; X86-LABEL: test_mm256_maskz_shuffle_f64x2:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shuffle_f64x2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x double> %shuffle, <4 x double> zeroinitializer
  ret <4 x double> %1
}
1345
; Integer counterparts of the f32x4/f64x2 shuffle tests: unmasked relaxes to
; vperm2i128; masked forms keep the EVEX vshufi32x4/vshufi64x2 encodings.
define <4 x i64> @test_mm256_shuffle_i32x4(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_shuffle_i32x4:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x i64> %shuffle
}

; Merge-masked i32x4: masking done as an <8 x i32> select after bitcasts.
define <4 x i64> @test_mm256_mask_shuffle_i32x4(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_shuffle_i32x4:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shuffle_i32x4:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %0 = bitcast <4 x i64> %shuffle to <8 x i32>
  %1 = bitcast <4 x i64> %__W to <8 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  ret <4 x i64> %4
}

; Zero-masked i32x4 -> vshufi32x4 {%k1} {z}.
define <4 x i64> @test_mm256_maskz_shuffle_i32x4(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_shuffle_i32x4:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shuffle_i32x4:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %0 = bitcast <4 x i64> %shuffle to <8 x i32>
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i32> %0, <8 x i32> zeroinitializer
  %3 = bitcast <8 x i32> %2 to <4 x i64>
  ret <4 x i64> %3
}

; i64x2 unmasked form -> vperm2i128.
define <4 x i64> @test_mm256_shuffle_i64x2(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_shuffle_i64x2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x i64> %shuffle
}

; Merge-masked i64x2: <4 x i1> mask extract -> vshufi64x2 {%k1}.
define <4 x i64> @test_mm256_mask_shuffle_i64x2(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_shuffle_i64x2:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shuffle_i64x2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x i64> %shuffle, <4 x i64> %__W
  ret <4 x i64> %1
}

; Zero-masked i64x2 -> vshufi64x2 {%k1} {z}.
define <4 x i64> @test_mm256_maskz_shuffle_i64x2(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_shuffle_i64x2:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shuffle_i64x2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x i64> %shuffle, <4 x i64> zeroinitializer
  ret <4 x i64> %1
}
1452
; _mm_test_epi32_mask: and + icmp ne, widened to <8 x i1> by shuffling with a
; zero vector, then bitcast to i8; expected to become vptestmd producing %k0.
define zeroext i8 @test_mm_test_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_test_epi32_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vptestmd %xmm0, %xmm1, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
  %1 = icmp ne <4 x i32> %0, zeroinitializer
  %2 = shufflevector <4 x i1> %1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = bitcast <8 x i1> %2 to i8
  ret i8 %3
}

; Masked variant: the caller mask is and-ed into the compare result, which
; must fold into vptestmd's {%k1} write-mask.
define zeroext i8 @test_mm_mask_test_epi32_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_test_epi32_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vptestmd %xmm0, %xmm1, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_test_epi32_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestmd %xmm0, %xmm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
  %1 = icmp ne <4 x i32> %0, zeroinitializer
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = and <4 x i1> %1, %extract.i
  %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = bitcast <8 x i1> %4 to i8
  ret i8 %5
}

; 256-bit unmasked: the <8 x i1> result bitcasts to i8 directly (no widening
; shuffle); ymm usage requires vzeroupper before return.
define zeroext i8 @test_mm256_test_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_test_epi32_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vptestmd %ymm0, %ymm1, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
  %1 = icmp ne <8 x i32> %0, zeroinitializer
  %2 = bitcast <8 x i1> %1 to i8
  ret i8 %2
}

; 256-bit masked variant -> vptestmd {%k1}.
define zeroext i8 @test_mm256_mask_test_epi32_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_test_epi32_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vptestmd %ymm0, %ymm1, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_test_epi32_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestmd %ymm0, %ymm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
  %1 = icmp ne <8 x i32> %0, zeroinitializer
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = and <8 x i1> %1, %2
  %4 = bitcast <8 x i1> %3 to i8
  ret i8 %4
}

; 64-bit-element version: <2 x i1> result widened to <8 x i1> -> vptestmq.
define zeroext i8 @test_mm_test_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_test_epi64_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vptestmq %xmm0, %xmm1, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = icmp ne <2 x i64> %and.i.i, zeroinitializer
  %1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %2 = bitcast <8 x i1> %1 to i8
  ret i8 %2
}
1557
; _mm_mask_test_epi64_mask: per-64-bit-lane (__B & __A) != 0, gated by the low 2 bits of
; __U. The and + icmp-ne + mask-and IR below should fold into one write-masked vptestmq;
; the 2 x i1 result is widened back to 8 x i1 (upper bits zero) and returned as i8.
define zeroext i8 @test_mm_mask_test_epi64_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_test_epi64_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vptestmq %xmm0, %xmm1, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_test_epi64_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestmq %xmm0, %xmm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = icmp ne <2 x i64> %and.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = and <2 x i1> %0, %extract.i
  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %4 = bitcast <8 x i1> %3 to i8
  ret i8 %4
}

; _mm256_test_epi64_mask: unmasked 256-bit variant — 4 x i64 lanes, widened from 4 x i1
; to 8 x i1 with zero upper bits; expects a single vptestmq on ymm plus vzeroupper.
define zeroext i8 @test_mm256_test_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_test_epi64_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vptestmq %ymm0, %ymm1, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = icmp ne <4 x i64> %and.i.i, zeroinitializer
  %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = bitcast <8 x i1> %1 to i8
  ret i8 %2
}

; _mm256_mask_test_epi64_mask: masked 256-bit variant — the low 4 bits of __U gate the
; 4 lanes, and the and of the compare with the extracted mask folds into {%k1}.
define zeroext i8 @test_mm256_mask_test_epi64_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_test_epi64_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vptestmq %ymm0, %ymm1, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_test_epi64_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestmq %ymm0, %ymm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = icmp ne <4 x i64> %and.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = and <4 x i1> %0, %extract.i
  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <8 x i1> %3 to i8
  ret i8 %4
}
1631
; _mm_testn_epi32_mask: "testn" is the inverted test — (__B & __A) == 0 per 32-bit lane
; (note icmp eq, vs. icmp ne for the plain test); expects a single vptestnmd.
define zeroext i8 @test_mm_testn_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_testn_epi32_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vptestnmd %xmm0, %xmm1, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
  %1 = icmp eq <4 x i32> %0, zeroinitializer
  %2 = shufflevector <4 x i1> %1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = bitcast <8 x i1> %2 to i8
  ret i8 %3
}

; _mm_mask_testn_epi32_mask: masked testn — the low 4 bits of __U gate the lanes and
; should fold into the {%k1} write-mask on vptestnmd.
define zeroext i8 @test_mm_mask_testn_epi32_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_testn_epi32_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vptestnmd %xmm0, %xmm1, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_testn_epi32_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestnmd %xmm0, %xmm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
  %1 = icmp eq <4 x i32> %0, zeroinitializer
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = and <4 x i1> %1, %extract.i
  %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = bitcast <8 x i1> %4 to i8
  ret i8 %5
}

; _mm256_testn_epi32_mask: 8 x i32 lanes fill the full i8 result, so no widening
; shuffle is needed — the 8 x i1 compare bitcasts straight to i8.
define zeroext i8 @test_mm256_testn_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_testn_epi32_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vptestnmd %ymm0, %ymm1, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
  %1 = icmp eq <8 x i32> %0, zeroinitializer
  %2 = bitcast <8 x i1> %1 to i8
  ret i8 %2
}

; _mm256_mask_testn_epi32_mask: all 8 bits of __U are used directly (no extract
; shuffle) since the vector has exactly 8 lanes.
define zeroext i8 @test_mm256_mask_testn_epi32_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_testn_epi32_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vptestnmd %ymm0, %ymm1, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_testn_epi32_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestnmd %ymm0, %ymm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
  %1 = icmp eq <8 x i32> %0, zeroinitializer
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = and <8 x i1> %1, %2
  %4 = bitcast <8 x i1> %3 to i8
  ret i8 %4
}
1721
; _mm_testn_epi64_mask: (__B & __A) == 0 per 64-bit lane; 2 x i1 result widened to
; 8 x i1 (upper bits zero) and returned as i8. Expects a single vptestnmq.
define zeroext i8 @test_mm_testn_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_testn_epi64_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vptestnmq %xmm0, %xmm1, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = icmp eq <2 x i64> %and.i.i, zeroinitializer
  %1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %2 = bitcast <8 x i1> %1 to i8
  ret i8 %2
}

; _mm_mask_testn_epi64_mask: masked variant — low 2 bits of __U gate the two lanes
; and should fold into the {%k1} write-mask.
define zeroext i8 @test_mm_mask_testn_epi64_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_testn_epi64_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vptestnmq %xmm0, %xmm1, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_testn_epi64_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestnmq %xmm0, %xmm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = icmp eq <2 x i64> %and.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = and <2 x i1> %0, %extract.i
  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %4 = bitcast <8 x i1> %3 to i8
  ret i8 %4
}

; _mm256_testn_epi64_mask: unmasked 256-bit variant; vptestnmq on ymm + vzeroupper.
define zeroext i8 @test_mm256_testn_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_testn_epi64_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vptestnmq %ymm0, %ymm1, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = icmp eq <4 x i64> %and.i.i, zeroinitializer
  %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = bitcast <8 x i1> %1 to i8
  ret i8 %2
}

; _mm256_mask_testn_epi64_mask: masked 256-bit variant — low 4 bits of __U gate the
; 4 lanes via the extract shuffle, folding into {%k1}.
define zeroext i8 @test_mm256_mask_testn_epi64_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_testn_epi64_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vptestnmq %ymm0, %ymm1, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_testn_epi64_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestnmq %ymm0, %ymm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = icmp eq <4 x i64> %and.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = and <4 x i1> %0, %extract.i
  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <8 x i1> %3 to i8
  ret i8 %4
}
1810
; _mm_mask_set1_epi32: masked select of the constant splat <5,5,5,5> over __O; the
; select should lower to a write-masked vpbroadcastd loading 5 from the constant pool.
define <2 x i64> @test_mm_mask_set1_epi32(<2 x i64> %__O, i8 zeroext %__M)  {
; X86-LABEL: test_mm_mask_set1_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastd {{\.LCPI.*}}, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_set1_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__O to <4 x i32>
  %1 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i32> <i32 5, i32 5, i32 5, i32 5>, <4 x i32> %0
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}

; _mm_maskz_set1_epi32: zero-masking variant — select against zeroinitializer lowers
; to vpbroadcastd with {%k1} {z}.
define <2 x i64> @test_mm_maskz_set1_epi32(i8 zeroext %__M) {
; X86-LABEL: test_mm_maskz_set1_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastd {{\.LCPI.*}}, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_set1_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x i32> <i32 5, i32 5, i32 5, i32 5>, <4 x i32> zeroinitializer
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

; _mm256_mask_set1_epi32: 256-bit merge-masking variant — all 8 bits of __M are used
; directly since the vector has exactly 8 lanes.
define <4 x i64> @test_mm256_mask_set1_epi32(<4 x i64> %__O, i8 zeroext %__M)  {
; X86-LABEL: test_mm256_mask_set1_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastd {{\.LCPI.*}}, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_set1_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__O to <8 x i32>
  %1 = bitcast i8 %__M to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <8 x i32> %0
  %3 = bitcast <8 x i32> %2 to <4 x i64>
  ret <4 x i64> %3
}

; _mm256_maskz_set1_epi32: 256-bit zero-masking variant.
define <4 x i64> @test_mm256_maskz_set1_epi32(i8 zeroext %__M)  {
; X86-LABEL: test_mm256_maskz_set1_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastd {{\.LCPI.*}}, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_set1_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <8 x i32> zeroinitializer
  %2 = bitcast <8 x i32> %1 to <4 x i64>
  ret <4 x i64> %2
}
1894
; _mm_mask_set1_epi64: masked splat of an i64 argument. On 64-bit the GPR feeds
; vpbroadcastq %rsi directly; on 32-bit the i64 must first be assembled from two
; stack words with vmovd + vpinsrd before the masked xmm broadcast.
define <2 x i64> @test_mm_mask_set1_epi64(<2 x i64> %__O, i8 zeroext %__M, i64 %__A)  {
; X86-LABEL: test_mm_mask_set1_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_set1_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastq %rsi, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecinit.i.i.i = insertelement <2 x i64> undef, i64 %__A, i32 0
  %vecinit1.i.i.i = shufflevector <2 x i64> %vecinit.i.i.i, <2 x i64> undef, <2 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %1 = select <2 x i1> %extract.i, <2 x i64> %vecinit1.i.i.i, <2 x i64> %__O
  ret <2 x i64> %1
}

; _mm_maskz_set1_epi64: zero-masking variant of the i64 splat ({%k1} {z}).
define <2 x i64> @test_mm_maskz_set1_epi64(i8 zeroext %__M, i64 %__A)  {
; X86-LABEL: test_mm_maskz_set1_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_set1_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastq %rsi, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecinit.i.i.i = insertelement <2 x i64> undef, i64 %__A, i32 0
  %vecinit1.i.i.i = shufflevector <2 x i64> %vecinit.i.i.i, <2 x i64> undef, <2 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %1 = select <2 x i1> %extract.i, <2 x i64> %vecinit1.i.i.i, <2 x i64> zeroinitializer
  ret <2 x i64> %1
}


; _mm256_mask_set1_epi64: 256-bit merge-masking variant — low 4 bits of __M gate the
; four 64-bit lanes of the ymm broadcast.
define <4 x i64> @test_mm256_mask_set1_epi64(<4 x i64> %__O, i8 zeroext %__M, i64 %__A) {
; X86-LABEL: test_mm256_mask_set1_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastq %xmm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_set1_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastq %rsi, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecinit.i.i = insertelement <4 x i64> undef, i64 %__A, i32 0
  %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x i64> %vecinit3.i.i, <4 x i64> %__O
  ret <4 x i64> %1
}

; _mm256_maskz_set1_epi64: 256-bit zero-masking variant.
define <4 x i64> @test_mm256_maskz_set1_epi64(i8 zeroext %__M, i64 %__A)  {
; X86-LABEL: test_mm256_maskz_set1_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastq %xmm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_set1_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastq %rsi, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecinit.i.i = insertelement <4 x i64> undef, i64 %__A, i32 0
  %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x i64> %vecinit3.i.i, <4 x i64> zeroinitializer
  ret <4 x i64> %1
}
1991
; _mm_broadcastd_epi32: lane-0 splat of a 4 x i32 vector. With no mask the backend is
; free to use the float-domain vbroadcastss; the masked forms below must keep the
; integer vpbroadcastd so the write-mask applies per 32-bit element.
define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
  %res1 = bitcast <4 x i32> %res0 to <2 x i64>
  ret <2 x i64> %res1
}

; _mm_mask_broadcastd_epi32: merge-masking — low 4 bits of __M select which lanes of
; __O are overwritten by the splat.
define <2 x i64> @test_mm_mask_broadcastd_epi32(<2 x i64> %__O, i8 zeroext %__M, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_broadcastd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastd %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_broadcastd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
  %1 = bitcast <2 x i64> %__O to <4 x i32>
  %2 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract.i, <4 x i32> %shuffle.i.i, <4 x i32> %1
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}

; _mm_maskz_broadcastd_epi32: zero-masking variant ({%k1} {z}).
define <2 x i64> @test_mm_maskz_broadcastd_epi32(i8 zeroext %__M, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_broadcastd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastd %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_broadcastd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
  %1 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i32> %shuffle.i.i, <4 x i32> zeroinitializer
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}

; _mm256_broadcastd_epi32: lane-0 splat widened to 8 x i32 (xmm source, ymm dest).
define <4 x i64> @test_mm256_broadcastd_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <8 x i32> zeroinitializer
  %res1 = bitcast <8 x i32> %res0 to <4 x i64>
  ret <4 x i64> %res1
}

; _mm256_mask_broadcastd_epi32: merge-masking with the full 8-bit mask (8 lanes).
define <4 x i64> @test_mm256_mask_broadcastd_epi32(<4 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X86-LABEL: test_mm256_mask_broadcastd_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastd %xmm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_broadcastd_epi32:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd %xmm1, %ymm0 {%k1}
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg1, <8 x i32> %res0, <8 x i32> %arg0
  %res2 = bitcast <8 x i32> %res1 to <4 x i64>
  ret <4 x i64> %res2
}

; _mm256_maskz_broadcastd_epi32: zero-masking variant of the 256-bit splat.
define <4 x i64> @test_mm256_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm256_maskz_broadcastd_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastd %xmm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_broadcastd_epi32:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg0, <8 x i32> %res0, <8 x i32> zeroinitializer
  %res2 = bitcast <8 x i32> %res1 to <4 x i64>
  ret <4 x i64> %res2
}
2103
; _mm_broadcastq_epi64: lane-0 splat of a 2 x i64 vector -> vpbroadcastq.
define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastq %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
  ret <2 x i64> %res
}

; _mm_mask_broadcastq_epi64: merge-masking — low 2 bits of __M gate the two lanes.
define <2 x i64> @test_mm_mask_broadcastq_epi64(<2 x i64> %__O, i8 zeroext %__M, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_broadcastq_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_broadcastq_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <2 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %1 = select <2 x i1> %extract.i, <2 x i64> %shuffle.i.i, <2 x i64> %__O
  ret <2 x i64> %1
}

; _mm_maskz_broadcastq_epi64: zero-masking variant ({%k1} {z}).
define <2 x i64> @test_mm_maskz_broadcastq_epi64(i8 zeroext %__M, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_broadcastq_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_broadcastq_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <2 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %1 = select <2 x i1> %extract.i, <2 x i64> %shuffle.i.i, <2 x i64> zeroinitializer
  ret <2 x i64> %1
}

; _mm256_broadcastq_epi64: unmasked 256-bit splat; the float-domain vbroadcastsd is
; acceptable here since no write-mask is involved.
define <4 x i64> @test_mm256_broadcastq_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> zeroinitializer
  ret <4 x i64> %res
}

; _mm256_mask_broadcastq_epi64: merge-masking — low 4 bits of __M gate the 4 lanes.
define <4 x i64> @test_mm256_mask_broadcastq_epi64(<4 x i64> %__O, i8 zeroext %__M, <2 x i64> %__A) {
; X86-LABEL: test_mm256_mask_broadcastq_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastq %xmm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_broadcastq_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastq %xmm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <4 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x i64> %shuffle.i.i, <4 x i64> %__O
  ret <4 x i64> %1
}

; _mm256_maskz_broadcastq_epi64: zero-masking variant of the 256-bit splat.
define <4 x i64> @test_mm256_maskz_broadcastq_epi64(i8 zeroext %__M, <2 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_broadcastq_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastq %xmm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_broadcastq_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastq %xmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <4 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x i64> %shuffle.i.i, <4 x i64> zeroinitializer
  ret <4 x i64> %1
}
2205
; _mm256_broadcastsd_pd: lane-0 splat of a double to all 4 ymm lanes -> vbroadcastsd.
define <4 x double> @test_mm256_broadcastsd_pd(<2 x double> %a0) {
; CHECK-LABEL: test_mm256_broadcastsd_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> zeroinitializer
  ret <4 x double> %res
}

; _mm256_mask_broadcastsd_pd: merge-masking — low 4 bits of __M select which lanes of
; __O receive the splat.
define <4 x double> @test_mm256_mask_broadcastsd_pd(<4 x double> %__O, i8 zeroext %__M, <2 x double> %__A) {
; X86-LABEL: test_mm256_mask_broadcastsd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vbroadcastsd %xmm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_broadcastsd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastsd %xmm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <4 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> %__O
  ret <4 x double> %1
}

; _mm256_maskz_broadcastsd_pd: zero-masking variant ({%k1} {z}).
define <4 x double> @test_mm256_maskz_broadcastsd_pd(i8 zeroext %__M, <2 x double> %__A) {
; X86-LABEL: test_mm256_maskz_broadcastsd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vbroadcastsd %xmm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_broadcastsd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastsd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <4 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> zeroinitializer
  ret <4 x double> %1
}
2256
; _mm_broadcastss_ps: lane-0 float splat -> vbroadcastss xmm.
define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm_broadcastss_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %res
}

; _mm_mask_broadcastss_ps: merge-masking — low 4 bits of __M gate the 4 float lanes.
define <4 x float> @test_mm_mask_broadcastss_ps(<4 x float> %__O, i8 zeroext %__M, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_broadcastss_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vbroadcastss %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_broadcastss_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastss %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> %__O
  ret <4 x float> %1
}

; _mm_maskz_broadcastss_ps: zero-masking variant ({%k1} {z}).
define <4 x float> @test_mm_maskz_broadcastss_ps(i8 zeroext %__M, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_broadcastss_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vbroadcastss %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_broadcastss_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastss %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> zeroinitializer
  ret <4 x float> %1
}

; _mm256_broadcastss_ps: lane-0 float splat widened to 8 lanes (xmm -> ymm).
define <8 x float> @test_mm256_broadcastss_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm256_broadcastss_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> zeroinitializer
  ret <8 x float> %res
}

; _mm256_mask_broadcastss_ps: merge-masking with the full 8-bit mask (8 lanes).
define <8 x float> @test_mm256_mask_broadcastss_ps(<8 x float> %a0, i8 %a1, <4 x float> %a2) {
; X86-LABEL: test_mm256_mask_broadcastss_ps:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vbroadcastss %xmm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_broadcastss_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastss %xmm1, %ymm0 {%k1}
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
  ret <8 x float> %res1
}

; _mm256_maskz_broadcastss_ps: zero-masking variant of the 256-bit float splat.
define <8 x float> @test_mm256_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) {
; X86-LABEL: test_mm256_maskz_broadcastss_ps:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vbroadcastss %xmm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_broadcastss_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastss %xmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
  ret <8 x float> %res1
}
2354
2355define <2 x double> @test_mm_movddup_pd(<2 x double> %a0) {
2356; CHECK-LABEL: test_mm_movddup_pd:
2357; CHECK:       # %bb.0:
2358; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2359; CHECK-NEXT:    ret{{[l|q]}}
2360  %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
2361  ret <2 x double> %res
2362}
2363
; _mm_mask_movedup_pd: merge-masked movddup. Only the low 2 bits of the i8 mask
; are used (the <2 x i32> extract); clear-bit lanes keep the passthru %__W.
2364define <2 x double> @test_mm_mask_movedup_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A) {
2365; X86-LABEL: test_mm_mask_movedup_pd:
2366; X86:       # %bb.0: # %entry
2367; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2368; X86-NEXT:    kmovw %eax, %k1
2369; X86-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
2370; X86-NEXT:    retl
2371;
2372; X64-LABEL: test_mm_mask_movedup_pd:
2373; X64:       # %bb.0: # %entry
2374; X64-NEXT:    kmovw %edi, %k1
2375; X64-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
2376; X64-NEXT:    retq
2377entry:
2378  %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <2 x i32> zeroinitializer
2379  %0 = bitcast i8 %__U to <8 x i1>
2380  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
2381  %1 = select <2 x i1> %extract.i, <2 x double> %shuffle.i.i, <2 x double> %__W
2382  ret <2 x double> %1
2383}
2384
; _mm_maskz_movedup_pd: zero-masked movddup. Same shuffle as the merge form,
; but clear-bit lanes become 0.0 (select vs. zeroinitializer).
2385define <2 x double> @test_mm_maskz_movedup_pd(i8 zeroext %__U, <2 x double> %__A) {
2386; X86-LABEL: test_mm_maskz_movedup_pd:
2387; X86:       # %bb.0: # %entry
2388; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2389; X86-NEXT:    kmovw %eax, %k1
2390; X86-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
2391; X86-NEXT:    retl
2392;
2393; X64-LABEL: test_mm_maskz_movedup_pd:
2394; X64:       # %bb.0: # %entry
2395; X64-NEXT:    kmovw %edi, %k1
2396; X64-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
2397; X64-NEXT:    retq
2398entry:
2399  %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <2 x i32> zeroinitializer
2400  %0 = bitcast i8 %__U to <8 x i1>
2401  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
2402  %1 = select <2 x i1> %extract.i, <2 x double> %shuffle.i.i, <2 x double> zeroinitializer
2403  ret <2 x double> %1
2404}
2405
; _mm256_movddup_pd (unmasked): duplicate the even-indexed double within each
; 128-bit half (lanes 0,0,2,2).
2406define <4 x double> @test_mm256_movddup_pd(<4 x double> %a0) {
2407; CHECK-LABEL: test_mm256_movddup_pd:
2408; CHECK:       # %bb.0:
2409; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
2410; CHECK-NEXT:    ret{{[l|q]}}
2411  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
2412  ret <4 x double> %res
2413}
2414
; _mm256_mask_movedup_pd: merge-masked 256-bit movddup. Low 4 mask bits are
; used; clear-bit lanes keep the passthru %__W.
2415define <4 x double> @test_mm256_mask_movedup_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A) {
2416; X86-LABEL: test_mm256_mask_movedup_pd:
2417; X86:       # %bb.0: # %entry
2418; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2419; X86-NEXT:    kmovw %eax, %k1
2420; X86-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
2421; X86-NEXT:    retl
2422;
2423; X64-LABEL: test_mm256_mask_movedup_pd:
2424; X64:       # %bb.0: # %entry
2425; X64-NEXT:    kmovw %edi, %k1
2426; X64-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
2427; X64-NEXT:    retq
2428entry:
2429  %shuffle.i.i = shufflevector <4 x double> %__A, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
2430  %0 = bitcast i8 %__U to <8 x i1>
2431  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2432  %1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> %__W
2433  ret <4 x double> %1
2434}
2435
; _mm256_maskz_movedup_pd: zero-masked 256-bit movddup; clear-bit lanes are zeroed.
2436define <4 x double> @test_mm256_maskz_movedup_pd(i8 zeroext %__U, <4 x double> %__A) {
2437; X86-LABEL: test_mm256_maskz_movedup_pd:
2438; X86:       # %bb.0: # %entry
2439; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2440; X86-NEXT:    kmovw %eax, %k1
2441; X86-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
2442; X86-NEXT:    retl
2443;
2444; X64-LABEL: test_mm256_maskz_movedup_pd:
2445; X64:       # %bb.0: # %entry
2446; X64-NEXT:    kmovw %edi, %k1
2447; X64-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
2448; X64-NEXT:    retq
2449entry:
2450  %shuffle.i.i = shufflevector <4 x double> %__A, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
2451  %0 = bitcast i8 %__U to <8 x i1>
2452  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2453  %1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> zeroinitializer
2454  ret <4 x double> %1
2455}
2456
; _mm_movehdup_ps (unmasked): duplicate odd-indexed floats (lanes 1,1,3,3).
2457define <4 x float> @test_mm_movehdup_ps(<4 x float> %a0) {
2458; CHECK-LABEL: test_mm_movehdup_ps:
2459; CHECK:       # %bb.0:
2460; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
2461; CHECK-NEXT:    ret{{[l|q]}}
2462  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
2463  ret <4 x float> %res
2464}
2465
; _mm_mask_movehdup_ps: merge-masked vmovshdup. Low 4 mask bits are used;
; clear-bit lanes keep the passthru %__W.
2466define <4 x float> @test_mm_mask_movehdup_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A) {
2467; X86-LABEL: test_mm_mask_movehdup_ps:
2468; X86:       # %bb.0: # %entry
2469; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2470; X86-NEXT:    kmovw %eax, %k1
2471; X86-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
2472; X86-NEXT:    retl
2473;
2474; X64-LABEL: test_mm_mask_movehdup_ps:
2475; X64:       # %bb.0: # %entry
2476; X64-NEXT:    kmovw %edi, %k1
2477; X64-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
2478; X64-NEXT:    retq
2479entry:
2480  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
2481  %0 = bitcast i8 %__U to <8 x i1>
2482  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2483  %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> %__W
2484  ret <4 x float> %1
2485}
2486
; _mm_maskz_movehdup_ps: zero-masked vmovshdup; clear-bit lanes are zeroed.
2487define <4 x float> @test_mm_maskz_movehdup_ps(i8 zeroext %__U, <4 x float> %__A) {
2488; X86-LABEL: test_mm_maskz_movehdup_ps:
2489; X86:       # %bb.0: # %entry
2490; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2491; X86-NEXT:    kmovw %eax, %k1
2492; X86-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
2493; X86-NEXT:    retl
2494;
2495; X64-LABEL: test_mm_maskz_movehdup_ps:
2496; X64:       # %bb.0: # %entry
2497; X64-NEXT:    kmovw %edi, %k1
2498; X64-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
2499; X64-NEXT:    retq
2500entry:
2501  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
2502  %0 = bitcast i8 %__U to <8 x i1>
2503  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2504  %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> zeroinitializer
2505  ret <4 x float> %1
2506}
2507
; _mm256_movehdup_ps (unmasked): duplicate odd-indexed floats across all 8 lanes.
2508define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) {
2509; CHECK-LABEL: test_mm256_movehdup_ps:
2510; CHECK:       # %bb.0:
2511; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
2512; CHECK-NEXT:    ret{{[l|q]}}
2513  %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
2514  ret <8 x float> %res
2515}
2516
; _mm256_mask_movehdup_ps: merge-masked 256-bit vmovshdup; all 8 mask bits are
; used directly (no extract needed); clear-bit lanes keep %a0.
2517define <8 x float> @test_mm256_mask_movehdup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) {
2518; X86-LABEL: test_mm256_mask_movehdup_ps:
2519; X86:       # %bb.0:
2520; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2521; X86-NEXT:    kmovw %eax, %k1
2522; X86-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7]
2523; X86-NEXT:    retl
2524;
2525; X64-LABEL: test_mm256_mask_movehdup_ps:
2526; X64:       # %bb.0:
2527; X64-NEXT:    kmovw %edi, %k1
2528; X64-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7]
2529; X64-NEXT:    retq
2530  %arg1 = bitcast i8 %a1 to <8 x i1>
2531  %res0 = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
2532  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
2533  ret <8 x float> %res1
2534}
2535
; _mm256_maskz_movehdup_ps: zero-masked 256-bit vmovshdup; clear-bit lanes are zeroed.
2536define <8 x float> @test_mm256_maskz_movehdup_ps(i8 %a0, <8 x float> %a1) {
2537; X86-LABEL: test_mm256_maskz_movehdup_ps:
2538; X86:       # %bb.0:
2539; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2540; X86-NEXT:    kmovw %eax, %k1
2541; X86-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
2542; X86-NEXT:    retl
2543;
2544; X64-LABEL: test_mm256_maskz_movehdup_ps:
2545; X64:       # %bb.0:
2546; X64-NEXT:    kmovw %edi, %k1
2547; X64-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
2548; X64-NEXT:    retq
2549  %arg0 = bitcast i8 %a0 to <8 x i1>
2550  %res0 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
2551  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
2552  ret <8 x float> %res1
2553}
2554
; _mm_moveldup_ps (unmasked): duplicate even-indexed floats (lanes 0,0,2,2).
2555define <4 x float> @test_mm_moveldup_ps(<4 x float> %a0) {
2556; CHECK-LABEL: test_mm_moveldup_ps:
2557; CHECK:       # %bb.0:
2558; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
2559; CHECK-NEXT:    ret{{[l|q]}}
2560  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
2561  ret <4 x float> %res
2562}
2563
; _mm_mask_moveldup_ps: merge-masked vmovsldup. Low 4 mask bits are used;
; clear-bit lanes keep the passthru %__W.
2564define <4 x float> @test_mm_mask_moveldup_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A) {
2565; X86-LABEL: test_mm_mask_moveldup_ps:
2566; X86:       # %bb.0: # %entry
2567; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2568; X86-NEXT:    kmovw %eax, %k1
2569; X86-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
2570; X86-NEXT:    retl
2571;
2572; X64-LABEL: test_mm_mask_moveldup_ps:
2573; X64:       # %bb.0: # %entry
2574; X64-NEXT:    kmovw %edi, %k1
2575; X64-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
2576; X64-NEXT:    retq
2577entry:
2578  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
2579  %0 = bitcast i8 %__U to <8 x i1>
2580  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2581  %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> %__W
2582  ret <4 x float> %1
2583}
2584
; _mm_maskz_moveldup_ps: zero-masked vmovsldup; clear-bit lanes are zeroed.
2585define <4 x float> @test_mm_maskz_moveldup_ps(i8 zeroext %__U, <4 x float> %__A) {
2586; X86-LABEL: test_mm_maskz_moveldup_ps:
2587; X86:       # %bb.0: # %entry
2588; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2589; X86-NEXT:    kmovw %eax, %k1
2590; X86-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
2591; X86-NEXT:    retl
2592;
2593; X64-LABEL: test_mm_maskz_moveldup_ps:
2594; X64:       # %bb.0: # %entry
2595; X64-NEXT:    kmovw %edi, %k1
2596; X64-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
2597; X64-NEXT:    retq
2598entry:
2599  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
2600  %0 = bitcast i8 %__U to <8 x i1>
2601  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2602  %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> zeroinitializer
2603  ret <4 x float> %1
2604}
2605
; _mm256_moveldup_ps (unmasked): duplicate even-indexed floats across all 8 lanes.
2606define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) {
2607; CHECK-LABEL: test_mm256_moveldup_ps:
2608; CHECK:       # %bb.0:
2609; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
2610; CHECK-NEXT:    ret{{[l|q]}}
2611  %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
2612  ret <8 x float> %res
2613}
2614
; _mm256_mask_moveldup_ps: merge-masked 256-bit vmovsldup; all 8 mask bits are
; used directly; clear-bit lanes keep %a0.
2615define <8 x float> @test_mm256_mask_moveldup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) {
2616; X86-LABEL: test_mm256_mask_moveldup_ps:
2617; X86:       # %bb.0:
2618; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2619; X86-NEXT:    kmovw %eax, %k1
2620; X86-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
2621; X86-NEXT:    retl
2622;
2623; X64-LABEL: test_mm256_mask_moveldup_ps:
2624; X64:       # %bb.0:
2625; X64-NEXT:    kmovw %edi, %k1
2626; X64-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
2627; X64-NEXT:    retq
2628  %arg1 = bitcast i8 %a1 to <8 x i1>
2629  %res0 = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
2630  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
2631  ret <8 x float> %res1
2632}
2633
; _mm256_maskz_moveldup_ps: zero-masked 256-bit vmovsldup; clear-bit lanes are zeroed.
2634define <8 x float> @test_mm256_maskz_moveldup_ps(i8 %a0, <8 x float> %a1) {
2635; X86-LABEL: test_mm256_maskz_moveldup_ps:
2636; X86:       # %bb.0:
2637; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2638; X86-NEXT:    kmovw %eax, %k1
2639; X86-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
2640; X86-NEXT:    retl
2641;
2642; X64-LABEL: test_mm256_maskz_moveldup_ps:
2643; X64:       # %bb.0:
2644; X64-NEXT:    kmovw %edi, %k1
2645; X64-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
2646; X64-NEXT:    retq
2647  %arg0 = bitcast i8 %a0 to <8 x i1>
2648  %res0 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
2649  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
2650  ret <8 x float> %res1
2651}
2652
; _mm256_permutex_epi64 (unmasked, imm 0x03): cross-lane i64 permute <3,0,0,0>.
; llc emits vpermpd (FP domain) here since no masking forces the integer form.
2653define <4 x i64> @test_mm256_permutex_epi64(<4 x i64> %a0) {
2654; CHECK-LABEL: test_mm256_permutex_epi64:
2655; CHECK:       # %bb.0:
2656; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
2657; CHECK-NEXT:    ret{{[l|q]}}
2658  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
2659  ret <4 x i64> %res
2660}
2661
; _mm256_mask_permutex_epi64: merge-masked i64 permute; the masked form uses
; the integer instruction vpermq. Low 4 mask bits select; clear lanes keep %__W.
2662define <4 x i64> @test_mm256_mask_permutex_epi64(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X) {
2663; X86-LABEL: test_mm256_mask_permutex_epi64:
2664; X86:       # %bb.0: # %entry
2665; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2666; X86-NEXT:    kmovw %eax, %k1
2667; X86-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm1[3,0,0,0]
2668; X86-NEXT:    retl
2669;
2670; X64-LABEL: test_mm256_mask_permutex_epi64:
2671; X64:       # %bb.0: # %entry
2672; X64-NEXT:    kmovw %edi, %k1
2673; X64-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm1[3,0,0,0]
2674; X64-NEXT:    retq
2675entry:
2676  %perm = shufflevector <4 x i64> %__X, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
2677  %0 = bitcast i8 %__M to <8 x i1>
2678  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2679  %1 = select <4 x i1> %extract, <4 x i64> %perm, <4 x i64> %__W
2680  ret <4 x i64> %1
2681}
2682
; _mm256_maskz_permutex_epi64: zero-masked i64 permute (vpermq); clear lanes are zeroed.
2683define <4 x i64> @test_mm256_maskz_permutex_epi64(i8 zeroext %__M, <4 x i64> %__X) {
2684; X86-LABEL: test_mm256_maskz_permutex_epi64:
2685; X86:       # %bb.0: # %entry
2686; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2687; X86-NEXT:    kmovw %eax, %k1
2688; X86-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0]
2689; X86-NEXT:    retl
2690;
2691; X64-LABEL: test_mm256_maskz_permutex_epi64:
2692; X64:       # %bb.0: # %entry
2693; X64-NEXT:    kmovw %edi, %k1
2694; X64-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0]
2695; X64-NEXT:    retq
2696entry:
2697  %perm = shufflevector <4 x i64> %__X, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
2698  %0 = bitcast i8 %__M to <8 x i1>
2699  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2700  %1 = select <4 x i1> %extract, <4 x i64> %perm, <4 x i64> zeroinitializer
2701  ret <4 x i64> %1
2702}
2703
; _mm256_permutex_pd (unmasked, imm 0x03): cross-lane double permute <3,0,0,0> (vpermpd).
2704define <4 x double> @test_mm256_permutex_pd(<4 x double> %a0) {
2705; CHECK-LABEL: test_mm256_permutex_pd:
2706; CHECK:       # %bb.0:
2707; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
2708; CHECK-NEXT:    ret{{[l|q]}}
2709  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
2710  ret <4 x double> %res
2711}
2712
; _mm256_mask_permutex_pd: merge-masked vpermpd with permute <1,0,0,0>;
; clear-bit lanes keep the passthru %__W.
2713define <4 x double> @test_mm256_mask_permutex_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__X) {
2714; X86-LABEL: test_mm256_mask_permutex_pd:
2715; X86:       # %bb.0: # %entry
2716; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2717; X86-NEXT:    kmovw %eax, %k1
2718; X86-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
2719; X86-NEXT:    retl
2720;
2721; X64-LABEL: test_mm256_mask_permutex_pd:
2722; X64:       # %bb.0: # %entry
2723; X64-NEXT:    kmovw %edi, %k1
2724; X64-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
2725; X64-NEXT:    retq
2726entry:
2727  %perm = shufflevector <4 x double> %__X, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
2728  %0 = bitcast i8 %__U to <8 x i1>
2729  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2730  %1 = select <4 x i1> %extract, <4 x double> %perm, <4 x double> %__W
2731  ret <4 x double> %1
2732}
2733
; _mm256_maskz_permutex_pd: zero-masked vpermpd <1,0,0,0>; clear-bit lanes are zeroed.
2734define <4 x double> @test_mm256_maskz_permutex_pd(i8 zeroext %__U, <4 x double> %__X) {
2735; X86-LABEL: test_mm256_maskz_permutex_pd:
2736; X86:       # %bb.0: # %entry
2737; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2738; X86-NEXT:    kmovw %eax, %k1
2739; X86-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
2740; X86-NEXT:    retl
2741;
2742; X64-LABEL: test_mm256_maskz_permutex_pd:
2743; X64:       # %bb.0: # %entry
2744; X64-NEXT:    kmovw %edi, %k1
2745; X64-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
2746; X64-NEXT:    retq
2747entry:
2748  %perm = shufflevector <4 x double> %__X, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
2749  %0 = bitcast i8 %__U to <8 x i1>
2750  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2751  %1 = select <4 x i1> %extract, <4 x double> %perm, <4 x double> zeroinitializer
2752  ret <4 x double> %1
2753}
2754
; _mm_shuffle_pd (unmasked, imm selecting <1,3>): lowered to vunpckhpd.
2755define <2 x double> @test_mm_shuffle_pd(<2 x double> %a0, <2 x double> %a1) {
2756; CHECK-LABEL: test_mm_shuffle_pd:
2757; CHECK:       # %bb.0:
2758; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
2759; CHECK-NEXT:    ret{{[l|q]}}
2760  %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
2761  ret <2 x double> %res
2762}
2763
; _mm_mask_shuffle_pd: merge-masked vunpckhpd. Low 2 mask bits are used;
; clear-bit lanes keep the passthru %__W.
2764define <2 x double> @test_mm_mask_shuffle_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2765; X86-LABEL: test_mm_mask_shuffle_pd:
2766; X86:       # %bb.0: # %entry
2767; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2768; X86-NEXT:    kmovw %eax, %k1
2769; X86-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
2770; X86-NEXT:    retl
2771;
2772; X64-LABEL: test_mm_mask_shuffle_pd:
2773; X64:       # %bb.0: # %entry
2774; X64-NEXT:    kmovw %edi, %k1
2775; X64-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
2776; X64-NEXT:    retq
2777entry:
2778  %shufp = shufflevector <2 x double> %__A, <2 x double> %__B, <2 x i32> <i32 1, i32 3>
2779  %0 = bitcast i8 %__U to <8 x i1>
2780  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
2781  %1 = select <2 x i1> %extract, <2 x double> %shufp, <2 x double> %__W
2782  ret <2 x double> %1
2783}
2784
; _mm_maskz_shuffle_pd: zero-masked vunpckhpd; clear-bit lanes are zeroed.
2785define <2 x double> @test_mm_maskz_shuffle_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2786; X86-LABEL: test_mm_maskz_shuffle_pd:
2787; X86:       # %bb.0: # %entry
2788; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2789; X86-NEXT:    kmovw %eax, %k1
2790; X86-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
2791; X86-NEXT:    retl
2792;
2793; X64-LABEL: test_mm_maskz_shuffle_pd:
2794; X64:       # %bb.0: # %entry
2795; X64-NEXT:    kmovw %edi, %k1
2796; X64-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
2797; X64-NEXT:    retq
2798entry:
2799  %shufp = shufflevector <2 x double> %__A, <2 x double> %__B, <2 x i32> <i32 1, i32 3>
2800  %0 = bitcast i8 %__U to <8 x i1>
2801  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
2802  %1 = select <2 x i1> %extract, <2 x double> %shufp, <2 x double> zeroinitializer
2803  ret <2 x double> %1
2804}
2805
; _mm256_shuffle_pd (unmasked): per-128-bit-lane double shuffle <1,5,2,6> (vshufpd).
2806define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) {
2807; CHECK-LABEL: test_mm256_shuffle_pd:
2808; CHECK:       # %bb.0:
2809; CHECK-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
2810; CHECK-NEXT:    ret{{[l|q]}}
2811  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
2812  ret <4 x double> %res
2813}
2814
; _mm256_mask_shuffle_pd: merge-masked vshufpd. Low 4 mask bits are used;
; clear-bit lanes keep the passthru %__W.
2815define <4 x double> @test_mm256_mask_shuffle_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
2816; X86-LABEL: test_mm256_mask_shuffle_pd:
2817; X86:       # %bb.0: # %entry
2818; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2819; X86-NEXT:    kmovw %eax, %k1
2820; X86-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
2821; X86-NEXT:    retl
2822;
2823; X64-LABEL: test_mm256_mask_shuffle_pd:
2824; X64:       # %bb.0: # %entry
2825; X64-NEXT:    kmovw %edi, %k1
2826; X64-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
2827; X64-NEXT:    retq
2828entry:
2829  %shufp = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
2830  %0 = bitcast i8 %__U to <8 x i1>
2831  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2832  %1 = select <4 x i1> %extract, <4 x double> %shufp, <4 x double> %__W
2833  ret <4 x double> %1
2834}
2835
; _mm256_maskz_shuffle_pd: zero-masked vshufpd; clear-bit lanes are zeroed.
2836define <4 x double> @test_mm256_maskz_shuffle_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
2837; X86-LABEL: test_mm256_maskz_shuffle_pd:
2838; X86:       # %bb.0: # %entry
2839; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2840; X86-NEXT:    kmovw %eax, %k1
2841; X86-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
2842; X86-NEXT:    retl
2843;
2844; X64-LABEL: test_mm256_maskz_shuffle_pd:
2845; X64:       # %bb.0: # %entry
2846; X64-NEXT:    kmovw %edi, %k1
2847; X64-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
2848; X64-NEXT:    retq
2849entry:
2850  %shufp = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
2851  %0 = bitcast i8 %__U to <8 x i1>
2852  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2853  %1 = select <4 x i1> %extract, <4 x double> %shufp, <4 x double> zeroinitializer
2854  ret <4 x double> %1
2855}
2856
; _mm_shuffle_ps (unmasked): two floats from %a0 then two from %a1 (vshufps).
2857define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) {
2858; CHECK-LABEL: test_mm_shuffle_ps:
2859; CHECK:       # %bb.0:
2860; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
2861; CHECK-NEXT:    ret{{[l|q]}}
2862  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
2863  ret <4 x float> %res
2864}
2865
; _mm_mask_shuffle_ps: merge-masked vshufps. Low 4 mask bits are used;
; clear-bit lanes keep the passthru %__W.
2866define <4 x float> @test_mm_mask_shuffle_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2867; X86-LABEL: test_mm_mask_shuffle_ps:
2868; X86:       # %bb.0: # %entry
2869; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2870; X86-NEXT:    kmovw %eax, %k1
2871; X86-NEXT:    vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
2872; X86-NEXT:    retl
2873;
2874; X64-LABEL: test_mm_mask_shuffle_ps:
2875; X64:       # %bb.0: # %entry
2876; X64-NEXT:    kmovw %edi, %k1
2877; X64-NEXT:    vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
2878; X64-NEXT:    retq
2879entry:
2880  %shufp = shufflevector <4 x float> %__A, <4 x float> %__B, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
2881  %0 = bitcast i8 %__U to <8 x i1>
2882  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2883  %1 = select <4 x i1> %extract, <4 x float> %shufp, <4 x float> %__W
2884  ret <4 x float> %1
2885}
2886
; _mm_maskz_shuffle_ps: zero-masked vshufps; clear-bit lanes are zeroed.
2887define <4 x float> @test_mm_maskz_shuffle_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2888; X86-LABEL: test_mm_maskz_shuffle_ps:
2889; X86:       # %bb.0: # %entry
2890; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2891; X86-NEXT:    kmovw %eax, %k1
2892; X86-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
2893; X86-NEXT:    retl
2894;
2895; X64-LABEL: test_mm_maskz_shuffle_ps:
2896; X64:       # %bb.0: # %entry
2897; X64-NEXT:    kmovw %edi, %k1
2898; X64-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
2899; X64-NEXT:    retq
2900entry:
2901  %shufp = shufflevector <4 x float> %__A, <4 x float> %__B, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
2902  %0 = bitcast i8 %__U to <8 x i1>
2903  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2904  %1 = select <4 x i1> %extract, <4 x float> %shufp, <4 x float> zeroinitializer
2905  ret <4 x float> %1
2906}
2907
; _mm256_shuffle_ps (unmasked): same shufps pattern repeated in both 128-bit halves.
2908define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) {
2909; CHECK-LABEL: test_mm256_shuffle_ps:
2910; CHECK:       # %bb.0:
2911; CHECK-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
2912; CHECK-NEXT:    ret{{[l|q]}}
2913  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
2914  ret <8 x float> %res
2915}
2916
; _mm256_mask_shuffle_ps: merge-masked 256-bit vshufps; all 8 mask bits are
; used directly; clear-bit lanes keep %a0.
2917define <8 x float> @test_mm256_mask_shuffle_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2, <8 x float> %a3) {
2918; X86-LABEL: test_mm256_mask_shuffle_ps:
2919; X86:       # %bb.0:
2920; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2921; X86-NEXT:    kmovw %eax, %k1
2922; X86-NEXT:    vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4]
2923; X86-NEXT:    retl
2924;
2925; X64-LABEL: test_mm256_mask_shuffle_ps:
2926; X64:       # %bb.0:
2927; X64-NEXT:    kmovw %edi, %k1
2928; X64-NEXT:    vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4]
2929; X64-NEXT:    retq
2930  %arg1 = bitcast i8 %a1 to <8 x i1>
2931  %res0 = shufflevector <8 x float> %a2, <8 x float> %a3, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
2932  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
2933  ret <8 x float> %res1
2934}
2935
; _mm256_maskz_shuffle_ps: zero-masked 256-bit vshufps; clear-bit lanes are zeroed.
2936define <8 x float> @test_mm256_maskz_shuffle_ps(i8 %a0, <8 x float> %a1, <8 x float> %a2) {
2937; X86-LABEL: test_mm256_maskz_shuffle_ps:
2938; X86:       # %bb.0:
2939; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2940; X86-NEXT:    kmovw %eax, %k1
2941; X86-NEXT:    vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
2942; X86-NEXT:    retl
2943;
2944; X64-LABEL: test_mm256_maskz_shuffle_ps:
2945; X64:       # %bb.0:
2946; X64-NEXT:    kmovw %edi, %k1
2947; X64-NEXT:    vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
2948; X64-NEXT:    retq
2949  %arg0 = bitcast i8 %a0 to <8 x i1>
2950  %res0 = shufflevector <8 x float> %a1, <8 x float> %a2, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
2951  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
2952  ret <8 x float> %res1
2953}
2954
; _mm256_mask_mul_epi32: signed 32x32->64 multiply (vpmuldq). The shl/ashr-by-32
; pairs sign-extend the low 32 bits of each i64 lane; merge-masked with %__W.
2955define <4 x i64> @test_mm256_mask_mul_epi32(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
2956; X86-LABEL: test_mm256_mask_mul_epi32:
2957; X86:       # %bb.0: # %entry
2958; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2959; X86-NEXT:    kmovw %eax, %k1
2960; X86-NEXT:    vpmuldq %ymm1, %ymm2, %ymm0 {%k1}
2961; X86-NEXT:    retl
2962;
2963; X64-LABEL: test_mm256_mask_mul_epi32:
2964; X64:       # %bb.0: # %entry
2965; X64-NEXT:    kmovw %edi, %k1
2966; X64-NEXT:    vpmuldq %ymm1, %ymm2, %ymm0 {%k1}
2967; X64-NEXT:    retq
2968entry:
2969  %tmp = shl <4 x i64> %__X, <i64 32, i64 32, i64 32, i64 32>
2970  %tmp1 = ashr exact <4 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32>
2971  %tmp2 = shl <4 x i64> %__Y, <i64 32, i64 32, i64 32, i64 32>
2972  %tmp3 = ashr exact <4 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32>
2973  %tmp4 = mul nsw <4 x i64> %tmp3, %tmp1
2974  %tmp5 = bitcast i8 %__M to <8 x i1>
2975  %extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2976  %tmp6 = select <4 x i1> %extract.i, <4 x i64> %tmp4, <4 x i64> %__W
2977  ret <4 x i64> %tmp6
2978}
2979
; _mm256_maskz_mul_epi32: zero-masked signed 32x32->64 multiply (vpmuldq);
; clear-bit lanes are zeroed.
2980define <4 x i64> @test_mm256_maskz_mul_epi32(i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
2981; X86-LABEL: test_mm256_maskz_mul_epi32:
2982; X86:       # %bb.0:
2983; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2984; X86-NEXT:    kmovw %eax, %k1
2985; X86-NEXT:    vpmuldq %ymm0, %ymm1, %ymm0 {%k1} {z}
2986; X86-NEXT:    retl
2987;
2988; X64-LABEL: test_mm256_maskz_mul_epi32:
2989; X64:       # %bb.0:
2990; X64-NEXT:    kmovw %edi, %k1
2991; X64-NEXT:    vpmuldq %ymm0, %ymm1, %ymm0 {%k1} {z}
2992; X64-NEXT:    retq
2993  %tmp = shl <4 x i64> %__X, <i64 32, i64 32, i64 32, i64 32>
2994  %tmp1 = ashr exact <4 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32>
2995  %tmp2 = shl <4 x i64> %__Y, <i64 32, i64 32, i64 32, i64 32>
2996  %tmp3 = ashr exact <4 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32>
2997  %tmp4 = mul nsw <4 x i64> %tmp3, %tmp1
2998  %tmp5 = bitcast i8 %__M to <8 x i1>
2999  %extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3000  %tmp6 = select <4 x i1> %extract.i, <4 x i64> %tmp4, <4 x i64> zeroinitializer
3001  ret <4 x i64> %tmp6
3002}
3003
; _mm_mask_mul_epi32: 128-bit merge-masked signed 32x32->64 multiply (vpmuldq).
; Only the low 2 mask bits are used; clear-bit lanes keep %__W.
3004define <2 x i64> @test_mm_mask_mul_epi32(<2 x i64> %__W, i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
3005; X86-LABEL: test_mm_mask_mul_epi32:
3006; X86:       # %bb.0:
3007; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3008; X86-NEXT:    kmovw %eax, %k1
3009; X86-NEXT:    vpmuldq %xmm1, %xmm2, %xmm0 {%k1}
3010; X86-NEXT:    retl
3011;
3012; X64-LABEL: test_mm_mask_mul_epi32:
3013; X64:       # %bb.0:
3014; X64-NEXT:    kmovw %edi, %k1
3015; X64-NEXT:    vpmuldq %xmm1, %xmm2, %xmm0 {%k1}
3016; X64-NEXT:    retq
3017  %tmp = shl <2 x i64> %__X, <i64 32, i64 32>
3018  %tmp1 = ashr exact <2 x i64> %tmp, <i64 32, i64 32>
3019  %tmp2 = shl <2 x i64> %__Y, <i64 32, i64 32>
3020  %tmp3 = ashr exact <2 x i64> %tmp2, <i64 32, i64 32>
3021  %tmp4 = mul nsw <2 x i64> %tmp3, %tmp1
3022  %tmp5 = bitcast i8 %__M to <8 x i1>
3023  %extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
3024  %tmp6 = select <2 x i1> %extract.i, <2 x i64> %tmp4, <2 x i64> %__W
3025  ret <2 x i64> %tmp6
3026}
3027
; _mm_maskz_mul_epi32: 128-bit zero-masked signed 32x32->64 multiply (vpmuldq);
; clear-bit lanes are zeroed.
3028define <2 x i64> @test_mm_maskz_mul_epi32(i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
3029; X86-LABEL: test_mm_maskz_mul_epi32:
3030; X86:       # %bb.0:
3031; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3032; X86-NEXT:    kmovw %eax, %k1
3033; X86-NEXT:    vpmuldq %xmm0, %xmm1, %xmm0 {%k1} {z}
3034; X86-NEXT:    retl
3035;
3036; X64-LABEL: test_mm_maskz_mul_epi32:
3037; X64:       # %bb.0:
3038; X64-NEXT:    kmovw %edi, %k1
3039; X64-NEXT:    vpmuldq %xmm0, %xmm1, %xmm0 {%k1} {z}
3040; X64-NEXT:    retq
3041  %tmp = shl <2 x i64> %__X, <i64 32, i64 32>
3042  %tmp1 = ashr exact <2 x i64> %tmp, <i64 32, i64 32>
3043  %tmp2 = shl <2 x i64> %__Y, <i64 32, i64 32>
3044  %tmp3 = ashr exact <2 x i64> %tmp2, <i64 32, i64 32>
3045  %tmp4 = mul nsw <2 x i64> %tmp3, %tmp1
3046  %tmp5 = bitcast i8 %__M to <8 x i1>
3047  %extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
3048  %tmp6 = select <2 x i1> %extract.i, <2 x i64> %tmp4, <2 x i64> zeroinitializer
3049  ret <2 x i64> %tmp6
3050}
3051
; _mm256_mask_mul_epu32: unsigned 32x32->64 multiply (vpmuludq). The and-with-
; 0xffffffff zero-extends the low 32 bits of each i64 lane; merge-masked with %__W.
3052define <4 x i64> @test_mm256_mask_mul_epu32(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
3053; X86-LABEL: test_mm256_mask_mul_epu32:
3054; X86:       # %bb.0: # %entry
3055; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3056; X86-NEXT:    kmovw %eax, %k1
3057; X86-NEXT:    vpmuludq %ymm1, %ymm2, %ymm0 {%k1}
3058; X86-NEXT:    retl
3059;
3060; X64-LABEL: test_mm256_mask_mul_epu32:
3061; X64:       # %bb.0: # %entry
3062; X64-NEXT:    kmovw %edi, %k1
3063; X64-NEXT:    vpmuludq %ymm1, %ymm2, %ymm0 {%k1}
3064; X64-NEXT:    retq
3065entry:
3066  %tmp = and <4 x i64> %__X, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
3067  %tmp1 = and <4 x i64> %__Y, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
3068  %tmp2 = mul nuw <4 x i64> %tmp1, %tmp
3069  %tmp3 = bitcast i8 %__M to <8 x i1>
3070  %extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3071  %tmp4 = select <4 x i1> %extract.i, <4 x i64> %tmp2, <4 x i64> %__W
3072  ret <4 x i64> %tmp4
3073}
3074
; _mm256_maskz_mul_epu32: zero-masked unsigned 32x32->64 multiply (vpmuludq);
; clear-bit lanes are zeroed.
3075define <4 x i64> @test_mm256_maskz_mul_epu32(i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
3076; X86-LABEL: test_mm256_maskz_mul_epu32:
3077; X86:       # %bb.0: # %entry
3078; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3079; X86-NEXT:    kmovw %eax, %k1
3080; X86-NEXT:    vpmuludq %ymm0, %ymm1, %ymm0 {%k1} {z}
3081; X86-NEXT:    retl
3082;
3083; X64-LABEL: test_mm256_maskz_mul_epu32:
3084; X64:       # %bb.0: # %entry
3085; X64-NEXT:    kmovw %edi, %k1
3086; X64-NEXT:    vpmuludq %ymm0, %ymm1, %ymm0 {%k1} {z}
3087; X64-NEXT:    retq
3088entry:
3089  %tmp = and <4 x i64> %__X, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
3090  %tmp1 = and <4 x i64> %__Y, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
3091  %tmp2 = mul nuw <4 x i64> %tmp1, %tmp
3092  %tmp3 = bitcast i8 %__M to <8 x i1>
3093  %extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3094  %tmp4 = select <4 x i1> %extract.i, <4 x i64> %tmp2, <4 x i64> zeroinitializer
3095  ret <4 x i64> %tmp4
3096}
3097
; _mm_mask_mul_epu32: 128-bit variant; low 2 mask bits select between the
; vpmuludq result and the passthrough %__W.
define <2 x i64> @test_mm_mask_mul_epu32(<2 x i64> %__W, i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
; X86-LABEL: test_mm_mask_mul_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmuludq %xmm1, %xmm2, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_mul_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmuludq %xmm1, %xmm2, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %tmp = and <2 x i64> %__X, <i64 4294967295, i64 4294967295>
  %tmp1 = and <2 x i64> %__Y, <i64 4294967295, i64 4294967295>
  %tmp2 = mul nuw <2 x i64> %tmp1, %tmp
  %tmp3 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %tmp4 = select <2 x i1> %extract.i, <2 x i64> %tmp2, <2 x i64> %__W
  ret <2 x i64> %tmp4
}
3120
; _mm_maskz_mul_epu32: 128-bit zero-masked variant of the multiply above.
define <2 x i64> @test_mm_maskz_mul_epu32(i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
; X86-LABEL: test_mm_maskz_mul_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_mul_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %tmp = and <2 x i64> %__X, <i64 4294967295, i64 4294967295>
  %tmp1 = and <2 x i64> %__Y, <i64 4294967295, i64 4294967295>
  %tmp2 = mul nuw <2 x i64> %tmp1, %tmp
  %tmp3 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %tmp4 = select <2 x i1> %extract.i, <2 x i64> %tmp2, <2 x i64> zeroinitializer
  ret <2 x i64> %tmp4
}
3143
; _mm_cvtepi32_epi8: truncate 4 x i32 to 4 x i8 and widen with zeros; the
; trunc+shuffle pattern is expected to lower to a single vpshufb.
define <2 x i64> @test_mm_cvtepi32_epi8(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi32_epi8:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i = trunc <4 x i32> %0 to <4 x i8>
  %shuf.i = shufflevector <4 x i8> %conv.i, <4 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  %1 = bitcast <16 x i8> %shuf.i to <2 x i64>
  ret <2 x i64> %1
}
3156
; _mm_cvtepi32_epi16: truncate 4 x i32 to 4 x i16, zero the upper half;
; lowers to a single vpshufb.
define <2 x i64> @test_mm_cvtepi32_epi16(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi32_epi16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i = trunc <4 x i32> %0 to <4 x i16>
  %shuf.i = shufflevector <4 x i16> %conv.i, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %1 = bitcast <8 x i16> %shuf.i to <2 x i64>
  ret <2 x i64> %1
}
3169
; _mm_cvtepi64_epi8: truncate 2 x i64 to 2 x i8 and zero-extend the vector
; to 16 bytes; lowers to a single vpshufb.
define <2 x i64> @test_mm_cvtepi64_epi8(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi64_epi8:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %conv.i = trunc <2 x i64> %__A to <2 x i8>
  %shuf.i = shufflevector <2 x i8> %conv.i, <2 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %0 = bitcast <16 x i8> %shuf.i to <2 x i64>
  ret <2 x i64> %0
}
3181
; _mm_cvtepi64_epi16: truncate 2 x i64 to 2 x i16, pad with zeros; lowers
; to a single vpshufb.
define <2 x i64> @test_mm_cvtepi64_epi16(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi64_epi16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %conv.i = trunc <2 x i64> %__A to <2 x i16>
  %shuf.i = shufflevector <2 x i16> %conv.i, <2 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %0 = bitcast <8 x i16> %shuf.i to <2 x i64>
  ret <2 x i64> %0
}
3193
; _mm_cvtepi64_epi32: truncate 2 x i64 to 2 x i32, zero the upper two
; elements; lowers to vinsertps here.
define <2 x i64> @test_mm_cvtepi64_epi32(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi64_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %conv.i = trunc <2 x i64> %__A to <2 x i32>
  %shuf.i = shufflevector <2 x i32> %conv.i, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %0 = bitcast <4 x i32> %shuf.i to <2 x i64>
  ret <2 x i64> %0
}
3205
; _mm256_cvtepi32_epi16: plain 8 x i32 -> 8 x i16 truncation; lowers to
; vpmovdw (AVX512VL down-convert).
define <2 x i64> @test_mm256_cvtepi32_epi16(<4 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm256_cvtepi32_epi16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpmovdw %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %conv.i = trunc <8 x i32> %0 to <8 x i16>
  %1 = bitcast <8 x i16> %conv.i to <2 x i64>
  ret <2 x i64> %1
}
3218
; _mm256_mask_cvtepi32_epi16: merge-masked down-convert via the
; llvm.x86.avx512.mask.pmov.dw.256 intrinsic (passthrough comes from %__O).
define <2 x i64> @test_mm256_mask_cvtepi32_epi16(<2 x i64> %__O, i8 zeroext %__M, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_cvtepi32_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmovdw %ymm1, %xmm0 {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtepi32_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmovdw %ymm1, %xmm0 {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <2 x i64> %__O to <8 x i16>
  %2 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %0, <8 x i16> %1, i8 %__M)
  %3 = bitcast <8 x i16> %2 to <2 x i64>
  ret <2 x i64> %3
}
3241
; _mm256_maskz_cvtepi32_epi16: zero-masked form of the intrinsic above
; (passthrough is zeroinitializer).
define <2 x i64> @test_mm256_maskz_cvtepi32_epi16(i8 zeroext %__M, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_cvtepi32_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmovdw %ymm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtepi32_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmovdw %ymm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %0, <8 x i16> zeroinitializer, i8 %__M)
  %2 = bitcast <8 x i16> %1 to <2 x i64>
  ret <2 x i64> %2
}
3263
; _mm256_cvtepi64_epi32: plain 4 x i64 -> 4 x i32 truncation; lowers to
; vpmovqd.
define <2 x i64> @test_mm256_cvtepi64_epi32(<4 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm256_cvtepi64_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpmovqd %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %conv.i = trunc <4 x i64> %__A to <4 x i32>
  %0 = bitcast <4 x i32> %conv.i to <2 x i64>
  ret <2 x i64> %0
}
3275
; _mm256_mask_cvtepi64_epi32: trunc + select pattern (not the mask.pmov
; intrinsic); low 4 mask bits choose between the truncation and %__O.
define <2 x i64> @test_mm256_mask_cvtepi64_epi32(<2 x i64> %__O, i8 zeroext %__M, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_cvtepi64_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmovqd %ymm1, %xmm0 {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtepi64_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmovqd %ymm1, %xmm0 {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %conv.i.i = trunc <4 x i64> %__A to <4 x i32>
  %0 = bitcast <2 x i64> %__O to <4 x i32>
  %1 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i32> %conv.i.i, <4 x i32> %0
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}
3300
; _mm256_maskz_cvtepi64_epi32: zero-masked trunc + select pattern.
define <2 x i64> @test_mm256_maskz_cvtepi64_epi32(i8 zeroext %__M, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_cvtepi64_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmovqd %ymm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtepi64_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmovqd %ymm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %conv.i.i = trunc <4 x i64> %__A to <4 x i32>
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x i32> %conv.i.i, <4 x i32> zeroinitializer
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}
3324
; _mm256_cvtepi64_epi8: 4 x i64 -> 4 x i8 truncation widened with zeros;
; lowers to vpmovqb.
define <2 x i64> @test_mm256_cvtepi64_epi8(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_cvtepi64_epi8:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpmovqb %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %conv.i = trunc <4 x i64> %__A to <4 x i8>
  %shuf.i = shufflevector <4 x i8> %conv.i, <4 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  %0 = bitcast <16 x i8> %shuf.i to <2 x i64>
  ret <2 x i64> %0
}
3337
; _mm256_cvtepi64_epi16: 4 x i64 -> 4 x i16 truncation padded with zeros;
; lowers to vpmovqw.
define <2 x i64> @test_mm256_cvtepi64_epi16(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_cvtepi64_epi16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpmovqw %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %conv.i = trunc <4 x i64> %__A to <4 x i16>
  %shuf.i = shufflevector <4 x i16> %conv.i, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %0 = bitcast <8 x i16> %shuf.i to <2 x i64>
  ret <2 x i64> %0
}
3350
; _mm256_cvtepi32_epi8: 8 x i32 -> 8 x i8 truncation widened to 16 bytes
; with zeros; lowers to vpmovdb.
define <2 x i64> @test_mm256_cvtepi32_epi8(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_cvtepi32_epi8:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpmovdb %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %conv.i = trunc <8 x i32> %0 to <8 x i8>
  %shuf.i = shufflevector <8 x i8> %conv.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %1 = bitcast <16 x i8> %shuf.i to <2 x i64>
  ret <2 x i64> %1
}
3364
; _mm_ternarylogic_epi32: unmasked vpternlogd with imm8 = 4.
define <2 x i64> @test_mm_ternarylogic_epi32(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) {
; CHECK-LABEL: test_mm_ternarylogic_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpternlogd $4, %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__B to <4 x i32>
  %2 = bitcast <2 x i64> %__C to <4 x i32>
  %3 = tail call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, i32 4)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}
3378
3379declare <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i32) #2
3380
; _mm_mask_ternarylogic_epi32: vpternlogd result merged with operand %__A
; under the low 4 bits of %__U.
define <2 x i64> @test_mm_mask_ternarylogic_epi32(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__B, <2 x i64> %__C) {
; X86-LABEL: test_mm_mask_ternarylogic_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_ternarylogic_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__B to <4 x i32>
  %2 = bitcast <2 x i64> %__C to <4 x i32>
  %3 = tail call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, i32 4)
  %4 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract, <4 x i32> %3, <4 x i32> %0
  %6 = bitcast <4 x i32> %5 to <2 x i64>
  ret <2 x i64> %6
}
3405
; _mm_maskz_ternarylogic_epi32: zero-masked vpternlogd.
define <2 x i64> @test_mm_maskz_ternarylogic_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) {
; X86-LABEL: test_mm_maskz_ternarylogic_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_ternarylogic_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__B to <4 x i32>
  %2 = bitcast <2 x i64> %__C to <4 x i32>
  %3 = tail call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, i32 4)
  %4 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract, <4 x i32> %3, <4 x i32> zeroinitializer
  %6 = bitcast <4 x i32> %5 to <2 x i64>
  ret <2 x i64> %6
}
3430
; _mm256_ternarylogic_epi32: unmasked 256-bit vpternlogd with imm8 = 4.
define <4 x i64> @test_mm256_ternarylogic_epi32(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) {
; CHECK-LABEL: test_mm256_ternarylogic_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpternlogd $4, %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
  %2 = bitcast <4 x i64> %__C to <8 x i32>
  %3 = tail call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2, i32 4)
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  ret <4 x i64> %4
}
3444
3445declare <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i32) #2
3446
; _mm256_mask_ternarylogic_epi32: all 8 mask bits are used (8 x i32 lanes),
; so no shufflevector mask extraction is needed here.
define <4 x i64> @test_mm256_mask_ternarylogic_epi32(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__B, <4 x i64> %__C) {
; X86-LABEL: test_mm256_mask_ternarylogic_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_ternarylogic_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
  %2 = bitcast <4 x i64> %__C to <8 x i32>
  %3 = tail call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2, i32 4)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %0
  %6 = bitcast <8 x i32> %5 to <4 x i64>
  ret <4 x i64> %6
}
3470
; _mm256_maskz_ternarylogic_epi32: zero-masked 256-bit vpternlogd.
define <4 x i64> @test_mm256_maskz_ternarylogic_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) {
; X86-LABEL: test_mm256_maskz_ternarylogic_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_ternarylogic_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
  %2 = bitcast <4 x i64> %__C to <8 x i32>
  %3 = tail call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2, i32 4)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> zeroinitializer
  %6 = bitcast <8 x i32> %5 to <4 x i64>
  ret <4 x i64> %6
}
3494
; _mm_ternarylogic_epi64: unmasked vpternlogq with imm8 = 4; no bitcasts
; needed since the operands are already <2 x i64>.
define <2 x i64> @test_mm_ternarylogic_epi64(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) {
; CHECK-LABEL: test_mm_ternarylogic_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpternlogq $4, %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C, i32 4)
  ret <2 x i64> %0
}
3504
3505declare <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i32) #2
3506
; _mm_mask_ternarylogic_epi64: vpternlogq merged with %__A under the low
; 2 bits of %__U.
define <2 x i64> @test_mm_mask_ternarylogic_epi64(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__B, <2 x i64> %__C) {
; X86-LABEL: test_mm_mask_ternarylogic_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_ternarylogic_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__A
  ret <2 x i64> %2
}
3527
; _mm_maskz_ternarylogic_epi64: zero-masked vpternlogq.
define <2 x i64> @test_mm_maskz_ternarylogic_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) {
; X86-LABEL: test_mm_maskz_ternarylogic_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_ternarylogic_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer
  ret <2 x i64> %2
}
3548
; _mm256_ternarylogic_epi64: unmasked 256-bit vpternlogq with imm8 = 4.
define <4 x i64> @test_mm256_ternarylogic_epi64(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) {
; CHECK-LABEL: test_mm256_ternarylogic_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpternlogq $4, %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C, i32 4)
  ret <4 x i64> %0
}
3558
3559declare <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i32) #2
3560
; _mm256_mask_ternarylogic_epi64: vpternlogq merged with %__A under the low
; 4 bits of %__U.
define <4 x i64> @test_mm256_mask_ternarylogic_epi64(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__B, <4 x i64> %__C) {
; X86-LABEL: test_mm256_mask_ternarylogic_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_ternarylogic_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__A
  ret <4 x i64> %2
}
3581
; _mm256_maskz_ternarylogic_epi64: zero-masked 256-bit vpternlogq.
define <4 x i64> @test_mm256_maskz_ternarylogic_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) {
; X86-LABEL: test_mm256_maskz_ternarylogic_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_ternarylogic_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer
  ret <4 x i64> %2
}
3602
; _mm_mask2_permutex2var_epi32: vpermi2d where unmasked lanes fall back to
; the index operand %__I (hence the vmovdqa of xmm1 into the return reg).
define <2 x i64> @test_mm_mask2_permutex2var_epi32(<2 x i64> %__A, <2 x i64> %__I, i8 zeroext %__U, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask2_permutex2var_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2d %xmm2, %xmm0, %xmm1 {%k1}
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask2_permutex2var_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2d %xmm2, %xmm0, %xmm1 {%k1}
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__I to <4 x i32>
  %2 = bitcast <2 x i64> %__B to <4 x i32>
  %3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> %1
  %6 = bitcast <4 x i32> %5 to <2 x i64>
  ret <2 x i64> %6
}
3629
; _mm256_mask2_permutex2var_epi32: 256-bit vpermi2d; all 8 mask bits apply,
; so the <8 x i1> mask is used directly without extraction.
define <4 x i64> @test_mm256_mask2_permutex2var_epi32(<4 x i64> %__A, <4 x i64> %__I, i8 zeroext %__U, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask2_permutex2var_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2d %ymm2, %ymm0, %ymm1 {%k1}
; X86-NEXT:    vmovdqa %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask2_permutex2var_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2d %ymm2, %ymm0, %ymm1 {%k1}
; X64-NEXT:    vmovdqa %ymm1, %ymm0
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__I to <8 x i32>
  %2 = bitcast <4 x i64> %__B to <8 x i32>
  %3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %1
  %6 = bitcast <8 x i32> %5 to <4 x i64>
  ret <4 x i64> %6
}
3655
; _mm_mask2_permutex2var_pd: vpermi2pd; unmasked lanes take the index
; operand %__I reinterpreted as doubles.
define <2 x double> @test_mm_mask2_permutex2var_pd(<2 x double> %__A, <2 x i64> %__I, i8 zeroext %__U, <2 x double> %__B) {
; X86-LABEL: test_mm_mask2_permutex2var_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2pd %xmm2, %xmm0, %xmm1 {%k1}
; X86-NEXT:    vmovapd %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask2_permutex2var_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2pd %xmm2, %xmm0, %xmm1 {%k1}
; X64-NEXT:    vmovapd %xmm1, %xmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B)
  %1 = bitcast <2 x i64> %__I to <2 x double>
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %3 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %1
  ret <2 x double> %3
}
3679
; _mm256_mask2_permutex2var_pd: 256-bit vpermi2pd with %__I as the
; passthrough for unmasked lanes (low 4 mask bits).
define <4 x double> @test_mm256_mask2_permutex2var_pd(<4 x double> %__A, <4 x i64> %__I, i8 zeroext %__U, <4 x double> %__B) {
; X86-LABEL: test_mm256_mask2_permutex2var_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2pd %ymm2, %ymm0, %ymm1 {%k1}
; X86-NEXT:    vmovapd %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask2_permutex2var_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2pd %ymm2, %ymm0, %ymm1 {%k1}
; X64-NEXT:    vmovapd %ymm1, %ymm0
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B)
  %1 = bitcast <4 x i64> %__I to <4 x double>
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %1
  ret <4 x double> %3
}
3703
; _mm_mask2_permutex2var_ps: vpermi2ps; unmasked lanes take %__I
; reinterpreted as floats (low 4 mask bits).
define <4 x float> @test_mm_mask2_permutex2var_ps(<4 x float> %__A, <2 x i64> %__I, i8 zeroext %__U, <4 x float> %__B) {
; X86-LABEL: test_mm_mask2_permutex2var_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2ps %xmm2, %xmm0, %xmm1 {%k1}
; X86-NEXT:    vmovaps %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask2_permutex2var_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2ps %xmm2, %xmm0, %xmm1 {%k1}
; X64-NEXT:    vmovaps %xmm1, %xmm0
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__I to <4 x i32>
  %1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B)
  %2 = bitcast <2 x i64> %__I to <4 x float>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x float> %1, <4 x float> %2
  ret <4 x float> %4
}
3728
; _mm256_mask2_permutex2var_ps: 256-bit vpermi2ps; all 8 mask bits apply,
; so the <8 x i1> mask is used without extraction.
define <8 x float> @test_mm256_mask2_permutex2var_ps(<8 x float> %__A, <4 x i64> %__I, i8 zeroext %__U, <8 x float> %__B) {
; X86-LABEL: test_mm256_mask2_permutex2var_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2ps %ymm2, %ymm0, %ymm1 {%k1}
; X86-NEXT:    vmovaps %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask2_permutex2var_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2ps %ymm2, %ymm0, %ymm1 {%k1}
; X64-NEXT:    vmovaps %ymm1, %ymm0
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__I to <8 x i32>
  %1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B)
  %2 = bitcast <4 x i64> %__I to <8 x float>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x float> %1, <8 x float> %2
  ret <8 x float> %4
}
3752
; _mm_mask2_permutex2var_epi64: vpermi2q with %__I as passthrough (low 2
; mask bits).
define <2 x i64> @test_mm_mask2_permutex2var_epi64(<2 x i64> %__A, <2 x i64> %__I, i8 zeroext %__U, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask2_permutex2var_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2q %xmm2, %xmm0, %xmm1 {%k1}
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask2_permutex2var_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2q %xmm2, %xmm0, %xmm1 {%k1}
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__I
  ret <2 x i64> %2
}
3775
; _mm256_mask2_permutex2var_epi64: 256-bit vpermi2q with %__I as
; passthrough (low 4 mask bits).
define <4 x i64> @test_mm256_mask2_permutex2var_epi64(<4 x i64> %__A, <4 x i64> %__I, i8 zeroext %__U, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask2_permutex2var_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2q %ymm2, %ymm0, %ymm1 {%k1}
; X86-NEXT:    vmovdqa %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask2_permutex2var_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2q %ymm2, %ymm0, %ymm1 {%k1}
; X64-NEXT:    vmovdqa %ymm1, %ymm0
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__I
  ret <4 x i64> %2
}
3798
; _mm_permutex2var_epi32: unmasked two-source permute; lowers to vpermt2d.
define <2 x i64> @test_mm_permutex2var_epi32(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_permutex2var_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermt2d %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__I to <4 x i32>
  %2 = bitcast <2 x i64> %__B to <4 x i32>
  %3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}
3812
; _mm_mask_permutex2var_epi32: permute result merged with the first data
; operand %__A, so the lowering keeps %__A in place and uses vpermt2d.
define <2 x i64> @test_mm_mask_permutex2var_epi32(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__I, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_permutex2var_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2d %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_permutex2var_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2d %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__I to <4 x i32>
  %2 = bitcast <2 x i64> %__B to <4 x i32>
  %3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> %0
  %6 = bitcast <4 x i32> %5 to <2 x i64>
  ret <2 x i64> %6
}
3837
3838define <2 x i64> @test_mm_maskz_permutex2var_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) {
3839; X86-LABEL: test_mm_maskz_permutex2var_epi32:
3840; X86:       # %bb.0: # %entry
3841; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3842; X86-NEXT:    kmovw %eax, %k1
3843; X86-NEXT:    vpermt2d %xmm2, %xmm1, %xmm0 {%k1} {z}
3844; X86-NEXT:    retl
3845;
3846; X64-LABEL: test_mm_maskz_permutex2var_epi32:
3847; X64:       # %bb.0: # %entry
3848; X64-NEXT:    kmovw %edi, %k1
3849; X64-NEXT:    vpermt2d %xmm2, %xmm1, %xmm0 {%k1} {z}
3850; X64-NEXT:    retq
3851entry:
3852  %0 = bitcast <2 x i64> %__A to <4 x i32>
3853  %1 = bitcast <2 x i64> %__I to <4 x i32>
3854  %2 = bitcast <2 x i64> %__B to <4 x i32>
3855  %3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
3856  %4 = bitcast i8 %__U to <8 x i1>
3857  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3858  %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> zeroinitializer
3859  %6 = bitcast <4 x i32> %5 to <2 x i64>
3860  ret <2 x i64> %6
3861}
3862
; Unmasked _mm256_permutex2var_epi32: lowers to a single ymm vpermt2d.
define <4 x i64> @test_mm256_permutex2var_epi32(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_permutex2var_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermt2d %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__I to <8 x i32>
  %2 = bitcast <4 x i64> %__B to <8 x i32>
  %3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  ret <4 x i64> %4
}

; Merge-masked 256-bit i32 form: 8 lanes, so the full i8 mask is used directly
; (no shufflevector extract, unlike the 128-bit variants).
define <4 x i64> @test_mm256_mask_permutex2var_epi32(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__I, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_permutex2var_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2d %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_permutex2var_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2d %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__I to <8 x i32>
  %2 = bitcast <4 x i64> %__B to <8 x i32>
  %3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %0
  %6 = bitcast <8 x i32> %5 to <4 x i64>
  ret <4 x i64> %6
}

; Zero-masked 256-bit i32 form: full i8 mask, {z} encoding expected.
define <4 x i64> @test_mm256_maskz_permutex2var_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_permutex2var_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2d %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_permutex2var_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2d %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__I to <8 x i32>
  %2 = bitcast <4 x i64> %__B to <8 x i32>
  %3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> zeroinitializer
  %6 = bitcast <8 x i32> %5 to <4 x i64>
  ret <4 x i64> %6
}
3924
; Unmasked _mm_permutex2var_pd: lowers to a single vpermt2pd.
define <2 x double> @test_mm_permutex2var_pd(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B) {
; CHECK-LABEL: test_mm_permutex2var_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermt2pd %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B)
  ret <2 x double> %0
}

; Merge-masked form: masked-off lanes keep __A; only the low 2 mask bits are
; used (2 x double lanes).
define <2 x double> @test_mm_mask_permutex2var_pd(<2 x double> %__A, i8 zeroext %__U, <2 x i64> %__I, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_permutex2var_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2pd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_permutex2var_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2pd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
  ret <2 x double> %2
}

; Zero-masked form: {z} encoding expected.
define <2 x double> @test_mm_maskz_permutex2var_pd(i8 zeroext %__U, <2 x double> %__A, <2 x i64> %__I, <2 x double> %__B) {
; X86-LABEL: test_mm_maskz_permutex2var_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2pd %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_permutex2var_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2pd %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
  ret <2 x double> %2
}
3976
; Unmasked _mm256_permutex2var_pd: lowers to a single ymm vpermt2pd.
define <4 x double> @test_mm256_permutex2var_pd(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B) {
; CHECK-LABEL: test_mm256_permutex2var_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermt2pd %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B)
  ret <4 x double> %0
}

; Merge-masked form: masked-off lanes keep __A; low 4 mask bits used.
define <4 x double> @test_mm256_mask_permutex2var_pd(<4 x double> %__A, i8 zeroext %__U, <4 x i64> %__I, <4 x double> %__B) {
; X86-LABEL: test_mm256_mask_permutex2var_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2pd %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_permutex2var_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2pd %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
  ret <4 x double> %2
}

; Zero-masked form: {z} encoding expected.
define <4 x double> @test_mm256_maskz_permutex2var_pd(i8 zeroext %__U, <4 x double> %__A, <4 x i64> %__I, <4 x double> %__B) {
; X86-LABEL: test_mm256_maskz_permutex2var_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2pd %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_permutex2var_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2pd %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
  ret <4 x double> %2
}
4028
; Unmasked _mm_permutex2var_ps: __m128i index is bitcast to <4 x i32>;
; lowers to a single vpermt2ps.
define <4 x float> @test_mm_permutex2var_ps(<4 x float> %__A, <2 x i64> %__I, <4 x float> %__B) {
; CHECK-LABEL: test_mm_permutex2var_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermt2ps %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__I to <4 x i32>
  %1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B)
  ret <4 x float> %1
}

; Merge-masked form: masked-off lanes keep __A; low 4 mask bits used.
define <4 x float> @test_mm_mask_permutex2var_ps(<4 x float> %__A, i8 zeroext %__U, <2 x i64> %__I, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_permutex2var_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2ps %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_permutex2var_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2ps %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__I to <4 x i32>
  %1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B)
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract.i, <4 x float> %1, <4 x float> %__A
  ret <4 x float> %3
}

; Zero-masked form: {z} encoding expected.
define <4 x float> @test_mm_maskz_permutex2var_ps(i8 zeroext %__U, <4 x float> %__A, <2 x i64> %__I, <4 x float> %__B) {
; X86-LABEL: test_mm_maskz_permutex2var_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2ps %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_permutex2var_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2ps %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__I to <4 x i32>
  %1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B)
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract.i, <4 x float> %1, <4 x float> zeroinitializer
  ret <4 x float> %3
}
4083
; Unmasked _mm256_permutex2var_ps: lowers to a single ymm vpermt2ps.
define <8 x float> @test_mm256_permutex2var_ps(<8 x float> %__A, <4 x i64> %__I, <8 x float> %__B) {
; CHECK-LABEL: test_mm256_permutex2var_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermt2ps %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__I to <8 x i32>
  %1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B)
  ret <8 x float> %1
}

; Merge-masked 256-bit ps form: 8 lanes, full i8 mask used directly.
define <8 x float> @test_mm256_mask_permutex2var_ps(<8 x float> %__A, i8 zeroext %__U, <4 x i64> %__I, <8 x float> %__B) {
; X86-LABEL: test_mm256_mask_permutex2var_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2ps %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_permutex2var_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2ps %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__I to <8 x i32>
  %1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B)
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %__A
  ret <8 x float> %3
}

; Zero-masked 256-bit ps form: {z} encoding expected.
define <8 x float> @test_mm256_maskz_permutex2var_ps(i8 zeroext %__U, <8 x float> %__A, <4 x i64> %__I, <8 x float> %__B) {
; X86-LABEL: test_mm256_maskz_permutex2var_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2ps %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_permutex2var_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2ps %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__I to <8 x i32>
  %1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B)
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> zeroinitializer
  ret <8 x float> %3
}
4136
; Unmasked _mm_permutex2var_epi64: lowers to a single vpermt2q.
define <2 x i64> @test_mm_permutex2var_epi64(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_permutex2var_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermt2q %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B)
  ret <2 x i64> %0
}

; Merge-masked form: masked-off lanes keep __A; low 2 mask bits used.
define <2 x i64> @test_mm_mask_permutex2var_epi64(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__I, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_permutex2var_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2q %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_permutex2var_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2q %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__A
  ret <2 x i64> %2
}

; Zero-masked form: {z} encoding expected.
define <2 x i64> @test_mm_maskz_permutex2var_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_permutex2var_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2q %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_permutex2var_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2q %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
  ret <2 x i64> %2
}
4188
; Unmasked _mm256_permutex2var_epi64: lowers to a single ymm vpermt2q.
define <4 x i64> @test_mm256_permutex2var_epi64(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_permutex2var_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermt2q %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B)
  ret <4 x i64> %0
}

; Merge-masked form: masked-off lanes keep __A; low 4 mask bits used.
define <4 x i64> @test_mm256_mask_permutex2var_epi64(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__I, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_permutex2var_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2q %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_permutex2var_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2q %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__A
  ret <4 x i64> %2
}

; Zero-masked form: {z} encoding expected.
define <4 x i64> @test_mm256_maskz_permutex2var_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_permutex2var_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2q %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_permutex2var_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2q %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
  ret <4 x i64> %2
}
4240
4241
; _mm_mask_fmadd_pd: llvm.fma + masked select over __A (the multiplicand).
; Because the blend's passthru is __A, codegen should pick the 132 form
; (dest = first multiplicand) so no extra move is needed.
define <2 x double> @test_mm_mask_fmadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd132pd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd132pd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
  ret <2 x double> %2
}

; _mm_mask_fmsub_pd: fmsub is expressed as fma with a negated addend
; (fsub from -0.0, matching clang's IR); expect vfmsub132pd.
define <2 x double> @test_mm_mask_fmsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub132pd {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub132pd {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
  ret <2 x double> %2
}

; _mm_mask3_fmadd_pd: passthru is the addend __C, so codegen should pick the
; 231 form (dest = addend) and then move the result back into xmm0.
define <2 x double> @test_mm_mask3_fmadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231pd {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231pd {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
  ret <2 x double> %2
}

; _mm_mask3_fnmadd_pd: fnmadd = fma with a negated multiplicand; mask3
; passthru is __C, so expect vfnmadd231pd + move.
define <2 x double> @test_mm_mask3_fnmadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd231pd {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231pd {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
  ret <2 x double> %2
}
4331
; _mm_maskz_fmadd_pd: zero-masking FMA — no passthru constraint, so codegen
; is free to pick the 213 form with {z} (implicit in the autogenerated
; comment form of the check line).
define <2 x double> @test_mm_maskz_fmadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
  ret <2 x double> %2
}

; _mm_maskz_fmsub_pd: negated addend (fsub from -0.0) folds into vfmsub213pd.
define <2 x double> @test_mm_maskz_fmsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
  ret <2 x double> %2
}

; _mm_maskz_fnmadd_pd: negated multiplicand folds into vfnmadd213pd.
define <2 x double> @test_mm_maskz_fnmadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
  ret <2 x double> %2
}

; _mm_maskz_fnmsub_pd: both multiplicand and addend negated -> vfnmsub213pd.
define <2 x double> @test_mm_maskz_fnmsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fnmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fnmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
  ret <2 x double> %2
}
4419
; 256-bit analogue of the masked FMA tests above: passthru __A -> 132 form.
define <4 x double> @test_mm256_mask_fmadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_mask_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd132pd {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd132pd {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm2
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
  ret <4 x double> %2
}

; 256-bit fmsub with merge-masking: negated addend -> vfmsub132pd.
define <4 x double> @test_mm256_mask_fmsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_mask_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub132pd {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub132pd {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
  ret <4 x double> %2
}

; 256-bit mask3 fmadd: passthru __C -> 231 form + move back to ymm0.
define <4 x double> @test_mm256_mask3_fmadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231pd {{.*#+}} ymm2 = (ymm0 * ymm1) + ymm2
; X86-NEXT:    vmovapd %ymm2, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask3_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231pd {{.*#+}} ymm2 = (ymm0 * ymm1) + ymm2
; X64-NEXT:    vmovapd %ymm2, %ymm0
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
  ret <4 x double> %2
}

; 256-bit mask3 fnmadd: negated multiplicand + passthru __C -> vfnmadd231pd.
define <4 x double> @test_mm256_mask3_fnmadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd231pd {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2
; X86-NEXT:    vmovapd %ymm2, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask3_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231pd {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2
; X64-NEXT:    vmovapd %ymm2, %ymm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
  ret <4 x double> %2
}
4509
; maskz fmadd (256-bit pd): (A*B)+C via llvm.fma; lanes where the low 4 bits of
; %__U are clear are zeroed (select against zeroinitializer). Expects vfmadd213pd.
define <4 x double> @test_mm256_maskz_fmadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
  ret <4 x double> %2
}
4530
; maskz fmsub (256-bit pd): (A*B)-C via llvm.fma with %__C sign-flipped (fsub
; from -0.0 splat); unset mask lanes zeroed. Expects vfmsub213pd.
define <4 x double> @test_mm256_maskz_fmsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
  ret <4 x double> %2
}
4552
; maskz fnmadd (256-bit pd): -(A*B)+C via llvm.fma with %__A sign-flipped;
; unset mask lanes zeroed. Expects vfnmadd213pd.
define <4 x double> @test_mm256_maskz_fnmadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
  ret <4 x double> %2
}
4574
; maskz fnmsub (256-bit pd): -(A*B)-C via llvm.fma with both %__A and %__C
; sign-flipped; unset mask lanes zeroed. Expects vfnmsub213pd.
define <4 x double> @test_mm256_maskz_fnmsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fnmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fnmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
  ret <4 x double> %2
}
4597
; mask fmadd (128-bit ps): (A*B)+C via llvm.fma; unset mask lanes (low 4 bits
; of %__U) keep %__A (merge-masking into the first source). Expects vfmadd132ps.
define <4 x float> @test_mm_mask_fmadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
  ret <4 x float> %2
}
4618
; mask fmsub (128-bit ps): (A*B)-C via llvm.fma with %__C sign-flipped;
; unset mask lanes keep %__A. Expects vfmsub132ps.
define <4 x float> @test_mm_mask_fmsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub132ps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub132ps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
  ret <4 x float> %2
}
4640
; mask3 fmadd (128-bit ps): (A*B)+C via llvm.fma; unset mask lanes keep %__C
; (mask3 merges into the addend). Expects vfmadd231ps then a move to xmm0.
define <4 x float> @test_mm_mask3_fmadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231ps {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231ps {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
  ret <4 x float> %2
}
4663
; mask3 fnmadd (128-bit ps): -(A*B)+C via llvm.fma with %__A sign-flipped;
; unset mask lanes keep %__C. Expects vfnmadd231ps then a move to xmm0.
define <4 x float> @test_mm_mask3_fnmadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
  ret <4 x float> %2
}
4687
; maskz fmadd (128-bit ps): (A*B)+C via llvm.fma; unset mask lanes zeroed.
; Expects vfmadd213ps.
define <4 x float> @test_mm_maskz_fmadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
  ret <4 x float> %2
}
4708
; maskz fmsub (128-bit ps): (A*B)-C via llvm.fma with %__C sign-flipped;
; unset mask lanes zeroed. Expects vfmsub213ps.
define <4 x float> @test_mm_maskz_fmsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
  ret <4 x float> %2
}
4730
; maskz fnmadd (128-bit ps): -(A*B)+C via llvm.fma with %__A sign-flipped;
; unset mask lanes zeroed. Expects vfnmadd213ps.
define <4 x float> @test_mm_maskz_fnmadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
  ret <4 x float> %2
}
4752
; maskz fnmsub (128-bit ps): -(A*B)-C via llvm.fma with both %__A and %__C
; sign-flipped; unset mask lanes zeroed. Expects vfnmsub213ps.
define <4 x float> @test_mm_maskz_fnmsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fnmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fnmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
  ret <4 x float> %2
}
4775
; mask fmadd (256-bit ps): (A*B)+C via llvm.fma; all 8 mask bits used directly
; (no extract needed for 8 lanes); unset lanes keep %__A. Expects vfmadd132ps.
define <8 x float> @test_mm256_mask_fmadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm2
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
  ret <8 x float> %2
}
4795
; mask fmsub (256-bit ps): (A*B)-C via llvm.fma with %__C sign-flipped;
; unset lanes keep %__A. Expects vfmsub132ps.
define <8 x float> @test_mm256_mask_fmsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub132ps {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub132ps {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
  ret <8 x float> %2
}
4816
; mask3 fmadd (256-bit ps): (A*B)+C via llvm.fma; unset lanes keep %__C
; (mask3 merges into the addend). Expects vfmadd231ps then a move to ymm0.
define <8 x float> @test_mm256_mask3_fmadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231ps {{.*#+}} ymm2 = (ymm0 * ymm1) + ymm2
; X86-NEXT:    vmovaps %ymm2, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask3_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231ps {{.*#+}} ymm2 = (ymm0 * ymm1) + ymm2
; X64-NEXT:    vmovaps %ymm2, %ymm0
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
  ret <8 x float> %2
}
4838
; mask3 fnmadd (256-bit ps): -(A*B)+C via llvm.fma with %__A sign-flipped;
; unset lanes keep %__C. Expects vfnmadd231ps then a move to ymm0.
define <8 x float> @test_mm256_mask3_fnmadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2
; X86-NEXT:    vmovaps %ymm2, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask3_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2
; X64-NEXT:    vmovaps %ymm2, %ymm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
  ret <8 x float> %2
}
4861
; maskz fmadd (256-bit ps): (A*B)+C via llvm.fma; unset lanes zeroed.
; Expects vfmadd213ps.
define <8 x float> @test_mm256_maskz_fmadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
  ret <8 x float> %2
}
4881
; maskz fmsub (256-bit ps): (A*B)-C via llvm.fma with %__C sign-flipped;
; unset lanes zeroed. Expects vfmsub213ps.
define <8 x float> @test_mm256_maskz_fmsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
  ret <8 x float> %2
}
4902
; maskz fnmadd (256-bit ps): -(A*B)+C via llvm.fma with %__A sign-flipped;
; unset lanes zeroed. Expects vfnmadd213ps.
define <8 x float> @test_mm256_maskz_fnmadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
  ret <8 x float> %2
}
4923
; maskz fnmsub (256-bit ps): -(A*B)-C via llvm.fma with both %__A and %__C
; sign-flipped; unset lanes zeroed. Expects vfnmsub213ps.
define <8 x float> @test_mm256_maskz_fnmsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fnmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fnmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
  ret <8 x float> %2
}
4945
; mask fmaddsub (128-bit pd): two llvm.fma calls — one plain (add) and one with
; %__C sign-flipped (sub) — interleaved by shufflevector <0,3> so lane 0
; subtracts and lane 1 adds; low 2 bits of %__U merge with %__A.
; Expects the pair to fold to a single masked vfmaddsub132pd.
define <2 x double> @test_mm_mask_fmaddsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fmaddsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub132pd {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmaddsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub132pd {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9
  %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> %__A
  ret <2 x double> %5
}
4969
; mask fmsubadd (128-bit pd): mirror of fmaddsub — the shuffle <0,3> takes
; lane 0 from the plain fma (add) and lane 1 from the fma on sign-flipped
; %__C (sub); low 2 bits of %__U merge with %__A. Expects vfmsubadd132pd.
define <2 x double> @test_mm_mask_fmsubadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fmsubadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd132pd {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmsubadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd132pd {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
  %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> %__A
  ret <2 x double> %4
}
4993
; mask3 fmaddsub (128-bit pd): same add/sub interleave as the mask variant, but
; unset mask lanes keep %__C (mask3 merges into the addend).
; Expects vfmaddsub231pd then a move to xmm0.
define <2 x double> @test_mm_mask3_fmaddsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmaddsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub231pd {{.*#+}} xmm2 = (xmm0 * xmm1) +/- xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmaddsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub231pd {{.*#+}} xmm2 = (xmm0 * xmm1) +/- xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9
  %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> %__C
  ret <2 x double> %5
}
5019
; maskz fmaddsub (128-bit pd): add/sub interleave of two llvm.fma results;
; unset mask lanes (low 2 bits of %__U) zeroed. Expects vfmaddsub213pd.
define <2 x double> @test_mm_maskz_fmaddsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmaddsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmaddsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9
  %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> zeroinitializer
  ret <2 x double> %5
}
5043
; maskz fmsubadd (128-bit pd): shuffle <0,3> takes lane 0 from the plain fma
; (add) and lane 1 from the fma on sign-flipped %__C (sub); unset mask lanes
; zeroed. Expects vfmsubadd213pd.
define <2 x double> @test_mm_maskz_fmsubadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmsubadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmsubadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
  %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> zeroinitializer
  ret <2 x double> %4
}
5067
; mask fmaddsub (256-bit pd): shuffle <0,5,2,7> interleaves the fma on
; sign-flipped %__C (even lanes, sub) with the plain fma (odd lanes, add);
; low 4 bits of %__U merge with %__A. Expects vfmaddsub132pd.
define <4 x double> @test_mm256_mask_fmaddsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_mask_fmaddsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub132pd {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_fmaddsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub132pd {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9
  %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> %__A
  ret <4 x double> %5
}
5091
; mask fmsubadd (256-bit pd): shuffle <0,5,2,7> takes even lanes from the plain
; fma (add) and odd lanes from the fma on sign-flipped %__C (sub); low 4 bits
; of %__U merge with %__A. Expects vfmsubadd132pd.
define <4 x double> @test_mm256_mask_fmsubadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_mask_fmsubadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd132pd {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_fmsubadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd132pd {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
  %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> %__A
  ret <4 x double> %4
}
5115
; mask3 fmaddsub (256-bit pd): add/sub interleave via shuffle <0,5,2,7>;
; unset mask lanes keep %__C (mask3 merges into the addend).
; Expects vfmaddsub231pd then a move to ymm0.
define <4 x double> @test_mm256_mask3_fmaddsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmaddsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub231pd {{.*#+}} ymm2 = (ymm0 * ymm1) +/- ymm2
; X86-NEXT:    vmovapd %ymm2, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask3_fmaddsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub231pd {{.*#+}} ymm2 = (ymm0 * ymm1) +/- ymm2
; X64-NEXT:    vmovapd %ymm2, %ymm0
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9
  %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> %__C
  ret <4 x double> %5
}
5141
; maskz fmaddsub (256-bit pd): add/sub interleave via shuffle <0,5,2,7>;
; unset mask lanes (low 4 bits of %__U) zeroed. Expects vfmaddsub213pd.
define <4 x double> @test_mm256_maskz_fmaddsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fmaddsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fmaddsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9
  %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> zeroinitializer
  ret <4 x double> %5
}
5165
; maskz fmsubadd (256-bit pd): shuffle <0,5,2,7> takes even lanes from the
; plain fma (add) and odd lanes from the fma on sign-flipped %__C (sub);
; unset mask lanes zeroed. Expects vfmsubadd213pd.
define <4 x double> @test_mm256_maskz_fmsubadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fmsubadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fmsubadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
  %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> zeroinitializer
  ret <4 x double> %4
}
5189
; _mm_mask_fmaddsub_ps: merge-masked variant — unselected lanes keep %__A, so
; the 132 form (destination is also the passthru operand) should be chosen:
; a single masked vfmaddsub132ps.
define <4 x float> @test_mm_mask_fmaddsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fmaddsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub132ps {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmaddsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub132ps {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9 ; A*B + C (feeds odd lanes)
  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C ; fneg idiom: -__C
  %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9 ; A*B - C (feeds even lanes)
  %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> ; blend: even lanes subtract, odd lanes add
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; low 4 bits of the 8-bit mask
  %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> %__A ; merge-mask: passthru is __A
  ret <4 x float> %5
}
5213
; _mm_mask_fmsubadd_ps: merge-masked fmsubadd (even lanes add, odd lanes
; subtract) with %__A as passthru; expect a single masked vfmsubadd132ps.
define <4 x float> @test_mm_mask_fmsubadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fmsubadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmsubadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C ; fneg idiom: -__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9 ; A*B - C (feeds odd lanes)
  %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9 ; A*B + C (feeds even lanes)
  %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> ; blend: even lanes add, odd lanes subtract
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; low 4 bits of the 8-bit mask
  %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> %__A ; merge-mask: passthru is __A
  ret <4 x float> %4
}
5237
; _mm_mask3_fmaddsub_ps: mask3 variant — unselected lanes keep %__C, so the
; 231 form (addend register is the destination/passthru) should be chosen,
; followed by a move of xmm2 into the return register.
define <4 x float> @test_mm_mask3_fmaddsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmaddsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) +/- xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmaddsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) +/- xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9 ; A*B + C (feeds odd lanes)
  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C ; fneg idiom: -__C
  %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9 ; A*B - C (feeds even lanes)
  %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> ; blend: even lanes subtract, odd lanes add
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; low 4 bits of the 8-bit mask
  %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> %__C ; mask3: passthru is __C
  ret <4 x float> %5
}
5263
; _mm_maskz_fmaddsub_ps: zero-masked fmaddsub; expect a single
; vfmaddsub213ps with the zeroing mask in %k1.
define <4 x float> @test_mm_maskz_fmaddsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmaddsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmaddsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9 ; A*B + C (feeds odd lanes)
  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C ; fneg idiom: -__C
  %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9 ; A*B - C (feeds even lanes)
  %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> ; blend: even lanes subtract, odd lanes add
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; low 4 bits of the 8-bit mask
  %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> zeroinitializer ; maskz: unselected lanes become zero
  ret <4 x float> %5
}
5287
; _mm_maskz_fmsubadd_ps: zero-masked fmsubadd; expect a single
; vfmsubadd213ps with the zeroing mask in %k1.
define <4 x float> @test_mm_maskz_fmsubadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmsubadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmsubadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C ; fneg idiom: -__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9 ; A*B - C (feeds odd lanes)
  %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9 ; A*B + C (feeds even lanes)
  %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> ; blend: even lanes add, odd lanes subtract
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; low 4 bits of the 8-bit mask
  %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> zeroinitializer ; maskz: unselected lanes become zero
  ret <4 x float> %4
}
5311
; _mm256_mask_fmaddsub_ps: 8-lane version — the i8 mask covers all lanes, so
; no mask-extract shuffle is needed; expect a single masked vfmaddsub132ps
; with %__A as passthru.
define <8 x float> @test_mm256_mask_fmaddsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fmaddsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub132ps {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_fmaddsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub132ps {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9 ; A*B + C (feeds odd lanes)
  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C ; fneg idiom: -__C
  %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9 ; A*B - C (feeds even lanes)
  %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> ; blend: even lanes subtract, odd lanes add
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %__A ; merge-mask: passthru is __A
  ret <8 x float> %5
}
5334
; _mm256_mask_fmsubadd_ps: 8-lane merge-masked fmsubadd; expect a single
; masked vfmsubadd132ps with %__A as passthru.
define <8 x float> @test_mm256_mask_fmsubadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fmsubadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_fmsubadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C ; fneg idiom: -__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9 ; A*B - C (feeds odd lanes)
  %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9 ; A*B + C (feeds even lanes)
  %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> ; blend: even lanes add, odd lanes subtract
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %__A ; merge-mask: passthru is __A
  ret <8 x float> %4
}
5357
; _mm256_mask3_fmaddsub_ps: mask3 variant keeping %__C in unselected lanes;
; expect a masked vfmaddsub231ps into ymm2 plus a move to the return register.
define <8 x float> @test_mm256_mask3_fmaddsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmaddsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) +/- ymm2
; X86-NEXT:    vmovaps %ymm2, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask3_fmaddsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) +/- ymm2
; X64-NEXT:    vmovaps %ymm2, %ymm0
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9 ; A*B + C (feeds odd lanes)
  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C ; fneg idiom: -__C
  %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9 ; A*B - C (feeds even lanes)
  %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> ; blend: even lanes subtract, odd lanes add
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %__C ; mask3: passthru is __C
  ret <8 x float> %5
}
5382
; _mm256_maskz_fmaddsub_ps: zero-masked 8-lane fmaddsub; expect a single
; vfmaddsub213ps with the zeroing mask in %k1.
define <8 x float> @test_mm256_maskz_fmaddsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fmaddsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fmaddsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9 ; A*B + C (feeds odd lanes)
  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C ; fneg idiom: -__C
  %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9 ; A*B - C (feeds even lanes)
  %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> ; blend: even lanes subtract, odd lanes add
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> zeroinitializer ; maskz: unselected lanes become zero
  ret <8 x float> %5
}
5405
; _mm256_maskz_fmsubadd_ps: zero-masked 8-lane fmsubadd; expect a single
; vfmsubadd213ps with the zeroing mask in %k1.
define <8 x float> @test_mm256_maskz_fmsubadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fmsubadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fmsubadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C ; fneg idiom: -__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9 ; A*B - C (feeds odd lanes)
  %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9 ; A*B + C (feeds even lanes)
  %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> ; blend: even lanes add, odd lanes subtract
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> zeroinitializer ; maskz: unselected lanes become zero
  ret <8 x float> %4
}
5428
; _mm_mask3_fmsub_pd: fmsub expressed as fma(A, B, -C); mask3 keeps %__C in
; unselected lanes, so expect a masked vfmsub231pd into xmm2 plus a move.
define <2 x double> @test_mm_mask3_fmsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub231pd {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub231pd {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C ; fneg idiom: -__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9 ; A*B - C
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> ; low 2 bits of the 8-bit mask
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C ; mask3: passthru is __C
  ret <2 x double> %2
}
5452
; _mm256_mask3_fmsub_pd: 256-bit mask3 fmsub; expect a masked vfmsub231pd
; into ymm2 plus a move to the return register.
define <4 x double> @test_mm256_mask3_fmsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub231pd {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2
; X86-NEXT:    vmovapd %ymm2, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask3_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub231pd {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2
; X64-NEXT:    vmovapd %ymm2, %ymm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C ; fneg idiom: -__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9 ; A*B - C
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; low 4 bits of the 8-bit mask
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C ; mask3: passthru is __C
  ret <4 x double> %2
}
5476
; _mm_mask3_fmsub_ps: 128-bit mask3 fmsub; expect a masked vfmsub231ps into
; xmm2 plus a move to the return register.
define <4 x float> @test_mm_mask3_fmsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C ; fneg idiom: -__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9 ; A*B - C
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; low 4 bits of the 8-bit mask
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C ; mask3: passthru is __C
  ret <4 x float> %2
}
5500
; _mm256_mask3_fmsub_ps: 8-lane mask3 fmsub — the i8 mask covers all lanes,
; so no extract shuffle; expect a masked vfmsub231ps plus a move.
define <8 x float> @test_mm256_mask3_fmsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2
; X86-NEXT:    vmovaps %ymm2, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask3_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2
; X64-NEXT:    vmovaps %ymm2, %ymm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C ; fneg idiom: -__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9 ; A*B - C
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C ; mask3: passthru is __C
  ret <8 x float> %2
}
5523
; _mm_mask3_fmsubadd_pd: 2-lane fmsubadd (lane 0 add, lane 1 subtract) with
; mask3 passthru %__C; expect a masked vfmsubadd231pd plus a move.
define <2 x double> @test_mm_mask3_fmsubadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsubadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd231pd {{.*#+}} xmm2 = (xmm0 * xmm1) -/+ xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmsubadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd231pd {{.*#+}} xmm2 = (xmm0 * xmm1) -/+ xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C ; fneg idiom: -__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9 ; A*B - C (feeds odd lane)
  %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9 ; A*B + C (feeds even lane)
  %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3> ; blend: lane 0 add, lane 1 subtract
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1> ; low 2 bits of the 8-bit mask
  %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> %__C ; mask3: passthru is __C
  ret <2 x double> %4
}
5549
; _mm256_mask3_fmsubadd_pd: 4-lane fmsubadd with mask3 passthru %__C; expect
; a masked vfmsubadd231pd into ymm2 plus a move.
define <4 x double> @test_mm256_mask3_fmsubadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmsubadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd231pd {{.*#+}} ymm2 = (ymm0 * ymm1) -/+ ymm2
; X86-NEXT:    vmovapd %ymm2, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask3_fmsubadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd231pd {{.*#+}} ymm2 = (ymm0 * ymm1) -/+ ymm2
; X64-NEXT:    vmovapd %ymm2, %ymm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C ; fneg idiom: -__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9 ; A*B - C (feeds odd lanes)
  %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9 ; A*B + C (feeds even lanes)
  %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> ; blend: even lanes add, odd lanes subtract
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; low 4 bits of the 8-bit mask
  %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> %__C ; mask3: passthru is __C
  ret <4 x double> %4
}
5575
; _mm_mask3_fmsubadd_ps: 4-lane float fmsubadd with mask3 passthru %__C;
; expect a masked vfmsubadd231ps into xmm2 plus a move.
define <4 x float> @test_mm_mask3_fmsubadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsubadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd231ps {{.*#+}} xmm2 = (xmm0 * xmm1) -/+ xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmsubadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd231ps {{.*#+}} xmm2 = (xmm0 * xmm1) -/+ xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C ; fneg idiom: -__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9 ; A*B - C (feeds odd lanes)
  %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9 ; A*B + C (feeds even lanes)
  %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> ; blend: even lanes add, odd lanes subtract
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; low 4 bits of the 8-bit mask
  %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> %__C ; mask3: passthru is __C
  ret <4 x float> %4
}
5601
; _mm256_mask3_fmsubadd_ps: 8-lane fmsubadd — full i8 mask, no extract
; shuffle; expect a masked vfmsubadd231ps into ymm2 plus a move.
define <8 x float> @test_mm256_mask3_fmsubadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmsubadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd231ps {{.*#+}} ymm2 = (ymm0 * ymm1) -/+ ymm2
; X86-NEXT:    vmovaps %ymm2, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask3_fmsubadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd231ps {{.*#+}} ymm2 = (ymm0 * ymm1) -/+ ymm2
; X64-NEXT:    vmovaps %ymm2, %ymm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C ; fneg idiom: -__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9 ; A*B - C (feeds odd lanes)
  %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9 ; A*B + C (feeds even lanes)
  %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> ; blend: even lanes add, odd lanes subtract
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %__C ; mask3: passthru is __C
  ret <8 x float> %4
}
5626
; _mm_mask_fnmadd_pd: fnmadd expressed as fma(A, -B, C); merge-masked with
; %__A as passthru, so expect a single masked vfnmadd132pd.
define <2 x double> @test_mm_mask_fnmadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd132pd {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd132pd {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B ; fneg idiom: -__B
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %__C) #9 ; -(A*B) + C
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> ; low 2 bits of the 8-bit mask
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A ; merge-mask: passthru is __A
  ret <2 x double> %2
}
5648
; _mm256_mask_fnmadd_pd: 256-bit merge-masked fnmadd; expect a single masked
; vfnmadd132pd with %__A as passthru.
define <4 x double> @test_mm256_mask_fnmadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_mask_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd132pd {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd132pd {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B ; fneg idiom: -__B
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %__C) #9 ; -(A*B) + C
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; low 4 bits of the 8-bit mask
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A ; merge-mask: passthru is __A
  ret <4 x double> %2
}
5670
; _mm_mask_fnmadd_ps: 128-bit float merge-masked fnmadd; expect a single
; masked vfnmadd132ps with %__A as passthru.
define <4 x float> @test_mm_mask_fnmadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B ; fneg idiom: -__B
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %__C) #9 ; -(A*B) + C
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; low 4 bits of the 8-bit mask
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A ; merge-mask: passthru is __A
  ret <4 x float> %2
}
5692
; _mm256_mask_fnmadd_ps: 8-lane merge-masked fnmadd — full i8 mask, no
; extract shuffle; expect a single masked vfnmadd132ps with %__A as passthru.
define <8 x float> @test_mm256_mask_fnmadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B ; fneg idiom: -__B
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %__C) #9 ; -(A*B) + C
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A ; merge-mask: passthru is __A
  ret <8 x float> %2
}
5713
; _mm_mask_fnmsub_pd: fnmsub expressed as fma(A, -B, -C); merge-masked with
; %__A as passthru, so expect a single masked vfnmsub132pd.
define <2 x double> @test_mm_mask_fnmsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fnmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub132pd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fnmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub132pd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B ; fneg idiom: -__B
  %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C ; fneg idiom: -__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %sub1.i) #9 ; -(A*B) - C
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> ; low 2 bits of the 8-bit mask
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A ; merge-mask: passthru is __A
  ret <2 x double> %2
}
5736
; _mm_mask3_fnmsub_pd: mask3 fnmsub keeping %__C in unselected lanes; expect
; a masked vfnmsub231pd into xmm2 plus a move to the return register.
define <2 x double> @test_mm_mask3_fnmsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub231pd {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fnmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub231pd {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B ; fneg idiom: -__B
  %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C ; fneg idiom: -__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %sub1.i) #9 ; -(A*B) - C
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> ; low 2 bits of the 8-bit mask
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C ; mask3: passthru is __C
  ret <2 x double> %2
}
5761
; _mm256_mask_fnmsub_pd: 256-bit merge-masked fnmsub; expect a single masked
; vfnmsub132pd with %__A as passthru.
define <4 x double> @test_mm256_mask_fnmsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_mask_fnmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub132pd {{.*#+}} ymm0 = -(ymm0 * ymm1) - ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_fnmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub132pd {{.*#+}} ymm0 = -(ymm0 * ymm1) - ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B ; fneg idiom: -__B
  %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C ; fneg idiom: -__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %sub1.i) #9 ; -(A*B) - C
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; low 4 bits of the 8-bit mask
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A ; merge-mask: passthru is __A
  ret <4 x double> %2
}
5784
; _mm256_mask3_fnmsub_pd: as above but masked-off lanes keep the addend %__C,
; so codegen should pick the 231 form writing into ymm2, then copy to ymm0.
define <4 x double> @test_mm256_mask3_fnmsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fnmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub231pd {{.*#+}} ymm2 = -(ymm0 * ymm1) - ymm2
; X86-NEXT:    vmovapd %ymm2, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask3_fnmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub231pd {{.*#+}} ymm2 = -(ymm0 * ymm1) - ymm2
; X64-NEXT:    vmovapd %ymm2, %ymm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
  %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; mask3 variant: the select falls back to %__C, not %__A.
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
  ret <4 x double> %2
}
5809
; _mm_mask_fnmsub_ps: 128-bit float fnmsub, merge-masked with %__A using the low
; 4 mask bits. Expected lowering: merge-masked vfnmsub132ps.
define <4 x float> @test_mm_mask_fnmsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fnmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fnmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; X64-NEXT:    retq
entry:
  ; fma(A, -B, -C) == -(A*B) - C.
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
  ret <4 x float> %2
}
5832
; _mm_mask3_fnmsub_ps: masked-off lanes keep the addend %__C, so the 231 form
; writing xmm2 is expected, followed by a copy into the return register.
define <4 x float> @test_mm_mask3_fnmsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fnmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; mask3 variant: fall back to %__C on clear mask bits.
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
  ret <4 x float> %2
}
5857
; _mm256_mask_fnmsub_ps: 256-bit float fnmsub; all 8 mask bits are used directly
; (no shufflevector extract needed). Merge-masked vfnmsub132ps expected.
define <8 x float> @test_mm256_mask_fnmsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fnmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) - ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_fnmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) - ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
  ret <8 x float> %2
}
5879
; _mm256_mask3_fnmsub_ps: 256-bit mask3 variant — clear mask bits keep %__C, so
; the 231 form writing ymm2 plus a vmovaps to ymm0 is the expected lowering.
define <8 x float> @test_mm256_mask3_fnmsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fnmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) - ymm2
; X86-NEXT:    vmovaps %ymm2, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask3_fnmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) - ymm2
; X64-NEXT:    vmovaps %ymm2, %ymm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
  ret <8 x float> %2
}
5903
; _mm_mask_expandloadu_pd: masked expand-load of 2 doubles; pass-through is %__W
; (merge masking). Expected lowering: merge-masked vexpandpd from memory.
define <2 x double> @test_mm_mask_expandloadu_pd(<2 x double> %__W, i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm_mask_expandloadu_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vexpandpd (%eax), %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_expandloadu_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vexpandpd (%rsi), %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to double*
  %1 = bitcast i8 %__U to <8 x i1>
  ; Only the low 2 mask bits are relevant for a 2-element load.
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = tail call <2 x double> @llvm.masked.expandload.v2f64(double* %0, <2 x i1> %extract.i, <2 x double> %__W)
  ret <2 x double> %2
}
5925
; _mm_maskz_expandloadu_pd: zero-masking variant — the pass-through is
; zeroinitializer, so the {z} form of vexpandpd is expected.
define <2 x double> @test_mm_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm_maskz_expandloadu_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vexpandpd (%eax), %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_expandloadu_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vexpandpd (%rsi), %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to double*
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = tail call <2 x double> @llvm.masked.expandload.v2f64(double* %0, <2 x i1> %extract.i, <2 x double> zeroinitializer)
  ret <2 x double> %2
}
5947
; _mm256_mask_expandloadu_pd: 256-bit merge-masked expand-load of 4 doubles
; using the low 4 mask bits; expects vexpandpd (mem), ymm {%k1}.
define <4 x double> @test_mm256_mask_expandloadu_pd(<4 x double> %__W, i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm256_mask_expandloadu_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vexpandpd (%eax), %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_expandloadu_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vexpandpd (%rsi), %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to double*
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = tail call <4 x double> @llvm.masked.expandload.v4f64(double* %0, <4 x i1> %extract.i, <4 x double> %__W)
  ret <4 x double> %2
}
5969
; _mm256_maskz_expandloadu_pd: zero-masking 256-bit variant; expects the {z}
; form of vexpandpd.
define <4 x double> @test_mm256_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm256_maskz_expandloadu_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vexpandpd (%eax), %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_expandloadu_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vexpandpd (%rsi), %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to double*
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = tail call <4 x double> @llvm.masked.expandload.v4f64(double* %0, <4 x i1> %extract.i, <4 x double> zeroinitializer)
  ret <4 x double> %2
}
5991
; _mm_mask_expandloadu_epi64: merge-masked expand-load of 2 x i64; expects the
; integer form vpexpandq {%k1}.
define <2 x i64> @test_mm_mask_expandloadu_epi64(<2 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm_mask_expandloadu_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vpexpandq (%eax), %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_expandloadu_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpexpandq (%rsi), %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to i64*
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = tail call <2 x i64> @llvm.masked.expandload.v2i64(i64* %0, <2 x i1> %extract.i, <2 x i64> %__W) #10
  ret <2 x i64> %2
}
6013
; _mm_maskz_expandloadu_epi64: zero-masking variant; expects vpexpandq {%k1} {z}.
define <2 x i64> @test_mm_maskz_expandloadu_epi64(i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm_maskz_expandloadu_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vpexpandq (%eax), %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_expandloadu_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpexpandq (%rsi), %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to i64*
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = tail call <2 x i64> @llvm.masked.expandload.v2i64(i64* %0, <2 x i1> %extract.i, <2 x i64> zeroinitializer)
  ret <2 x i64> %2
}
6035
; _mm256_mask_expandloadu_epi64: 256-bit merge-masked expand-load of 4 x i64
; using the low 4 mask bits; expects vpexpandq (mem), ymm {%k1}.
define <4 x i64> @test_mm256_mask_expandloadu_epi64(<4 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm256_mask_expandloadu_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vpexpandq (%eax), %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_expandloadu_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpexpandq (%rsi), %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to i64*
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = tail call <4 x i64> @llvm.masked.expandload.v4i64(i64* %0, <4 x i1> %extract.i, <4 x i64> %__W) #10
  ret <4 x i64> %2
}
6057
; _mm256_maskz_expandloadu_epi64: zero-masking 256-bit variant; expects
; vpexpandq {%k1} {z}.
define <4 x i64> @test_mm256_maskz_expandloadu_epi64(i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm256_maskz_expandloadu_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vpexpandq (%eax), %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_expandloadu_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpexpandq (%rsi), %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to i64*
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = tail call <4 x i64> @llvm.masked.expandload.v4i64(i64* %0, <4 x i1> %extract.i, <4 x i64> zeroinitializer)
  ret <4 x i64> %2
}
6079
; _mm_mask_expandloadu_ps: merge-masked expand-load of 4 floats using the low
; 4 mask bits; expects vexpandps (mem), xmm {%k1}.
define <4 x float> @test_mm_mask_expandloadu_ps(<4 x float> %__W, i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm_mask_expandloadu_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vexpandps (%eax), %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_expandloadu_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vexpandps (%rsi), %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to float*
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = tail call <4 x float> @llvm.masked.expandload.v4f32(float* %0, <4 x i1> %extract.i, <4 x float> %__W)
  ret <4 x float> %2
}
6101
; _mm_maskz_expandloadu_ps: zero-masking variant; expects vexpandps {%k1} {z}.
define <4 x float> @test_mm_maskz_expandloadu_ps(i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm_maskz_expandloadu_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vexpandps (%eax), %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_expandloadu_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vexpandps (%rsi), %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to float*
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = tail call <4 x float> @llvm.masked.expandload.v4f32(float* %0, <4 x i1> %extract.i, <4 x float> zeroinitializer)
  ret <4 x float> %2
}
6123
; _mm256_mask_expandloadu_ps: 8-element load, so all 8 mask bits are used
; directly (no extract). Expects vexpandps (mem), ymm {%k1}.
define <8 x float> @test_mm256_mask_expandloadu_ps(<8 x float> %__W, i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm256_mask_expandloadu_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vexpandps (%eax), %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_expandloadu_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vexpandps (%rsi), %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to float*
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = tail call <8 x float> @llvm.masked.expandload.v8f32(float* %0, <8 x i1> %1, <8 x float> %__W)
  ret <8 x float> %2
}
6144
; _mm256_maskz_expandloadu_ps: zero-masking variant; expects vexpandps {%k1} {z}.
define <8 x float> @test_mm256_maskz_expandloadu_ps(i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm256_maskz_expandloadu_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vexpandps (%eax), %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_expandloadu_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vexpandps (%rsi), %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to float*
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = tail call <8 x float> @llvm.masked.expandload.v8f32(float* %0, <8 x i1> %1, <8 x float> zeroinitializer)
  ret <8 x float> %2
}
6165
; _mm_mask_expandloadu_epi32: merge-masked expand-load of 4 x i32; the <2 x i64>
; intrinsic type is bitcast to <4 x i32> and back. Expects vpexpandd {%k1}.
define <2 x i64> @test_mm_mask_expandloadu_epi32(<2 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm_mask_expandloadu_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vpexpandd (%eax), %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_expandloadu_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpexpandd (%rsi), %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = bitcast i8* %__P to i32*
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x i32> @llvm.masked.expandload.v4i32(i32* %1, <4 x i1> %extract.i, <4 x i32> %0)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}
6189
; _mm_maskz_expandloadu_epi32: zero-masking variant; expects vpexpandd {%k1} {z}.
define <2 x i64> @test_mm_maskz_expandloadu_epi32(i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm_maskz_expandloadu_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vpexpandd (%eax), %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_expandloadu_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpexpandd (%rsi), %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to i32*
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = tail call <4 x i32> @llvm.masked.expandload.v4i32(i32* %0, <4 x i1> %extract.i, <4 x i32> zeroinitializer)
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}
6212
; _mm256_mask_expandloadu_epi32: 8 x i32 load so all 8 mask bits are used
; directly; <4 x i64> is bitcast to <8 x i32> and back. Expects vpexpandd {%k1}.
define <4 x i64> @test_mm256_mask_expandloadu_epi32(<4 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm256_mask_expandloadu_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vpexpandd (%eax), %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_expandloadu_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpexpandd (%rsi), %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__W to <8 x i32>
  %1 = bitcast i8* %__P to i32*
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = tail call <8 x i32> @llvm.masked.expandload.v8i32(i32* %1, <8 x i1> %2, <8 x i32> %0)
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  ret <4 x i64> %4
}
6235
; _mm256_maskz_expandloadu_epi32: zero-masking variant; expects
; vpexpandd {%k1} {z}.
define <4 x i64> @test_mm256_maskz_expandloadu_epi32(i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm256_maskz_expandloadu_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vpexpandd (%eax), %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_expandloadu_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpexpandd (%rsi), %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to i32*
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = tail call <8 x i32> @llvm.masked.expandload.v8i32(i32* %0, <8 x i1> %1, <8 x i32> zeroinitializer)
  %3 = bitcast <8 x i32> %2 to <4 x i64>
  ret <4 x i64> %3
}
6257
; _mm_mask_compressstoreu_pd: masked compress-store of 2 doubles to memory
; using the low 2 mask bits; expects vcompresspd xmm, (mem) {%k1}.
define void @test_mm_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_compressstoreu_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcompresspd %xmm0, (%ecx) {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_compressstoreu_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %esi, %k1
; X64-NEXT:    vcompresspd %xmm0, (%rdi) {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to double*
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  tail call void @llvm.masked.compressstore.v2f64(<2 x double> %__A, double* %0, <2 x i1> %extract.i)
  ret void
}
6279
; _mm256_mask_compressstoreu_pd: 256-bit masked compress-store of 4 doubles;
; the trailing vzeroupper reflects the ymm use in a vzeroupper-inserting ABI.
define void @test_mm256_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_compressstoreu_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcompresspd %ymm0, (%ecx) {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_compressstoreu_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %esi, %k1
; X64-NEXT:    vcompresspd %ymm0, (%rdi) {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to double*
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  tail call void @llvm.masked.compressstore.v4f64(<4 x double> %__A, double* %0, <4 x i1> %extract.i)
  ret void
}
6303
; _mm_mask_compressstoreu_epi64: masked compress-store of 2 x i64; expects the
; integer form vpcompressq xmm, (mem) {%k1}.
define void @test_mm_mask_compressstoreu_epi64(i8* %__P, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_compressstoreu_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpcompressq %xmm0, (%ecx) {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_compressstoreu_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %esi, %k1
; X64-NEXT:    vpcompressq %xmm0, (%rdi) {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to i64*
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  tail call void @llvm.masked.compressstore.v2i64(<2 x i64> %__A, i64* %0, <2 x i1> %extract.i)
  ret void
}
6325
; _mm256_mask_compressstoreu_epi64: 256-bit masked compress-store of 4 x i64;
; expects vpcompressq ymm, (mem) {%k1} followed by vzeroupper.
define void @test_mm256_mask_compressstoreu_epi64(i8* %__P, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_compressstoreu_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpcompressq %ymm0, (%ecx) {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_compressstoreu_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %esi, %k1
; X64-NEXT:    vpcompressq %ymm0, (%rdi) {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to i64*
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  tail call void @llvm.masked.compressstore.v4i64(<4 x i64> %__A, i64* %0, <4 x i1> %extract.i)
  ret void
}
6349
; _mm_mask_compressstoreu_ps: masked compress-store of 4 floats using the low
; 4 mask bits; expects vcompressps xmm, (mem) {%k1}.
define void @test_mm_mask_compressstoreu_ps(i8* %__P, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_compressstoreu_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcompressps %xmm0, (%ecx) {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_compressstoreu_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %esi, %k1
; X64-NEXT:    vcompressps %xmm0, (%rdi) {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to float*
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  tail call void @llvm.masked.compressstore.v4f32(<4 x float> %__A, float* %0, <4 x i1> %extract.i)
  ret void
}
6371
; _mm256_mask_compressstoreu_ps: 8-element store, so all 8 mask bits are used
; directly; expects vcompressps ymm, (mem) {%k1} followed by vzeroupper.
define void @test_mm256_mask_compressstoreu_ps(i8* %__P, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_mask_compressstoreu_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcompressps %ymm0, (%ecx) {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_compressstoreu_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %esi, %k1
; X64-NEXT:    vcompressps %ymm0, (%rdi) {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to float*
  %1 = bitcast i8 %__U to <8 x i1>
  tail call void @llvm.masked.compressstore.v8f32(<8 x float> %__A, float* %0, <8 x i1> %1)
  ret void
}
6394
; _mm_mask_compressstoreu_epi32: masked compress-store of 4 x i32 (the <2 x i64>
; argument is bitcast first); expects vpcompressd xmm, (mem) {%k1}.
define void @test_mm_mask_compressstoreu_epi32(i8* %__P, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_compressstoreu_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpcompressd %xmm0, (%ecx) {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_compressstoreu_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %esi, %k1
; X64-NEXT:    vpcompressd %xmm0, (%rdi) {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast i8* %__P to i32*
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  tail call void @llvm.masked.compressstore.v4i32(<4 x i32> %0, i32* %1, <4 x i1> %extract.i)
  ret void
}
6417
; _mm256_mask_compressstoreu_epi32: 8 x i32 store using all 8 mask bits;
; expects vpcompressd ymm, (mem) {%k1} followed by vzeroupper.
define void @test_mm256_mask_compressstoreu_epi32(i8* %__P, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_compressstoreu_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpcompressd %ymm0, (%ecx) {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_compressstoreu_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %esi, %k1
; X64-NEXT:    vpcompressd %ymm0, (%rdi) {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast i8* %__P to i32*
  %2 = bitcast i8 %__U to <8 x i1>
  tail call void @llvm.masked.compressstore.v8i32(<8 x i32> %0, i32* %1, <8 x i1> %2) #10
  ret void
}
6441
6442
6443declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) #8
6444declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #8
6445declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #8
6446declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>) #8
6447
; _mm_mask_sqrt_pd: sqrt of %__A merged with %__W under the low 2 mask bits;
; expects a merge-masked vsqrtpd.
define <2 x double> @test_mm_mask_sqrt_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_sqrt_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsqrtpd %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_sqrt_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtpd %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %__A) #2
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__W
  ret <2 x double> %2
}
6468
6469declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
6470
; _mm_maskz_sqrt_pd: zero-masking variant — select falls back to zero, so the
; {z} form of vsqrtpd is expected.
define <2 x double> @test_mm_maskz_sqrt_pd(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_sqrt_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsqrtpd %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_sqrt_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtpd %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %__A) #2
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
  ret <2 x double> %2
}
6491
; _mm256_mask_sqrt_pd: 256-bit merge-masked sqrt using the low 4 mask bits;
; expects a merge-masked vsqrtpd on ymm registers.
define <4 x double> @test_mm256_mask_sqrt_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_sqrt_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsqrtpd %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_sqrt_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtpd %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %__A) #2
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__W
  ret <4 x double> %2
}
6512
6513declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)
6514
; _mm256_maskz_sqrt_pd: 256-bit zero-masking variant; expects
; vsqrtpd {%k1} {z}.
define <4 x double> @test_mm256_maskz_sqrt_pd(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_sqrt_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsqrtpd %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_sqrt_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtpd %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %__A) #2
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
  ret <4 x double> %2
}
6535
; Merge-masked 128-bit float sqrt: llvm.sqrt.v4f32 + select on the low 4 mask
; bits must fold to vsqrtps with {%k1} merge-masking (passthru %__W).
define <4 x float> @test_mm_mask_sqrt_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_sqrt_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsqrtps %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_sqrt_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtps %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %__A) #2
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__W
  ret <4 x float> %2
}
6556
6557declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
6558
; Zero-masked 128-bit float sqrt: select against zeroinitializer must fold to
; vsqrtps with {%k1} {z} zeroing-masking.
define <4 x float> @test_mm_maskz_sqrt_ps(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_sqrt_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsqrtps %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_sqrt_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtps %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %__A) #2
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
  ret <4 x float> %2
}
6579
; Merge-masked 256-bit float sqrt: all 8 mask bits are used (no shufflevector
; extract needed); must fold to vsqrtps with {%k1} merge-masking.
define <8 x float> @test_mm256_mask_sqrt_ps(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_mask_sqrt_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsqrtps %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_sqrt_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtps %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %__A) #2
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__W
  ret <8 x float> %2
}
6599
; Zero-masked 256-bit float sqrt: full 8-bit mask, select against zero must
; fold to vsqrtps with {%k1} {z} zeroing-masking.
define <8 x float> @test_mm256_maskz_sqrt_ps(i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_maskz_sqrt_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsqrtps %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_sqrt_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtps %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %__A) #2
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
  ret <8 x float> %2
}
6619
6620declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
6621
; Unmasked immediate rotate-left of 32-bit lanes via
; @llvm.x86.avx512.prol.d.128; expects a single vprold $5.
define <2 x i64> @test_mm_rol_epi32(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_rol_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprold $5, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.prol.d.128(<4 x i32> %0, i32 5)
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}
6633
6634declare <4 x i32> @llvm.x86.avx512.prol.d.128(<4 x i32>, i32)
6635
; Merge-masked vprold: rotate intrinsic + select on the low 4 mask bits must
; fold to vprold $5 with {%k1} merge-masking (passthru %__W).
define <2 x i64> @test_mm_mask_rol_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_rol_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprold $5, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_rol_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprold $5, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.prol.d.128(<4 x i32> %0, i32 5)
  %2 = bitcast <2 x i64> %__W to <4 x i32>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %2
  %5 = bitcast <4 x i32> %4 to <2 x i64>
  ret <2 x i64> %5
}
6659
; Zero-masked vprold: select against zeroinitializer must fold to
; vprold $5 with {%k1} {z} zeroing-masking.
define <2 x i64> @test_mm_maskz_rol_epi32(i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_rol_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprold $5, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_rol_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprold $5, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.prol.d.128(<4 x i32> %0, i32 5)
  %2 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}
6682
; Unmasked 256-bit immediate rotate-left of 32-bit lanes via
; @llvm.x86.avx512.prol.d.256; expects a single vprold $5 on ymm.
define <4 x i64> @test_mm256_rol_epi32(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_rol_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprold $5, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = tail call <8 x i32> @llvm.x86.avx512.prol.d.256(<8 x i32> %0, i32 5)
  %2 = bitcast <8 x i32> %1 to <4 x i64>
  ret <4 x i64> %2
}
6694
6695declare <8 x i32> @llvm.x86.avx512.prol.d.256(<8 x i32>, i32)
6696
; Merge-masked 256-bit vprold: full 8-bit mask select must fold to
; vprold $5 with {%k1} merge-masking (passthru %__W).
define <4 x i64> @test_mm256_mask_rol_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_rol_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprold $5, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_rol_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprold $5, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = tail call <8 x i32> @llvm.x86.avx512.prol.d.256(<8 x i32> %0, i32 5)
  %2 = bitcast <4 x i64> %__W to <8 x i32>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x i32> %1, <8 x i32> %2
  %5 = bitcast <8 x i32> %4 to <4 x i64>
  ret <4 x i64> %5
}
6719
; Zero-masked 256-bit vprold: full 8-bit mask select against zero must fold
; to vprold $5 with {%k1} {z} zeroing-masking.
define <4 x i64> @test_mm256_maskz_rol_epi32(i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_rol_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprold $5, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_rol_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprold $5, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = tail call <8 x i32> @llvm.x86.avx512.prol.d.256(<8 x i32> %0, i32 5)
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  ret <4 x i64> %4
}
6741
; Unmasked immediate rotate-left of 64-bit lanes via
; @llvm.x86.avx512.prol.q.128; expects a single vprolq $5.
define <2 x i64> @test_mm_rol_epi64(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_rol_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprolq $5, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.prol.q.128(<2 x i64> %__A, i32 5)
  ret <2 x i64> %0
}
6751
6752declare <2 x i64> @llvm.x86.avx512.prol.q.128(<2 x i64>, i32)
6753
; Merge-masked vprolq: select on the low 2 mask bits must fold to
; vprolq $5 with {%k1} merge-masking (passthru %__W).
define <2 x i64> @test_mm_mask_rol_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_rol_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprolq $5, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_rol_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolq $5, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.prol.q.128(<2 x i64> %__A, i32 5)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__W
  ret <2 x i64> %2
}
6774
; Zero-masked vprolq: select against zeroinitializer must fold to
; vprolq $5 with {%k1} {z} zeroing-masking.
define <2 x i64> @test_mm_maskz_rol_epi64(i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_rol_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprolq $5, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_rol_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolq $5, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.prol.q.128(<2 x i64> %__A, i32 5)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer
  ret <2 x i64> %2
}
6795
; Unmasked 256-bit immediate rotate-left of 64-bit lanes via
; @llvm.x86.avx512.prol.q.256; expects a single vprolq $5 on ymm.
define <4 x i64> @test_mm256_rol_epi64(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_rol_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprolq $5, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.prol.q.256(<4 x i64> %__A, i32 5)
  ret <4 x i64> %0
}
6805
6806declare <4 x i64> @llvm.x86.avx512.prol.q.256(<4 x i64>, i32)
6807
; Merge-masked 256-bit vprolq: select on the low 4 mask bits must fold to
; vprolq $5 with {%k1} merge-masking (passthru %__W).
define <4 x i64> @test_mm256_mask_rol_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_rol_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprolq $5, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_rol_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolq $5, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.prol.q.256(<4 x i64> %__A, i32 5)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__W
  ret <4 x i64> %2
}
6828
; Zero-masked 256-bit vprolq: select against zeroinitializer must fold to
; vprolq $5 with {%k1} {z} zeroing-masking.
define <4 x i64> @test_mm256_maskz_rol_epi64(i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_rol_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprolq $5, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_rol_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolq $5, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.prol.q.256(<4 x i64> %__A, i32 5)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer
  ret <4 x i64> %2
}
6849
; Unmasked variable rotate-left of 32-bit lanes via
; @llvm.x86.avx512.prolv.d.128; expects a single vprolvd.
define <2 x i64> @test_mm_rolv_epi32(<2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_rolv_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__B to <4 x i32>
  %2 = tail call <4 x i32> @llvm.x86.avx512.prolv.d.128(<4 x i32> %0, <4 x i32> %1)
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}
6862
; Merge-masked vprolvd: variable rotate + select on the low 4 mask bits must
; fold to vprolvd with {%k1} merge-masking (passthru %__W).
define <2 x i64> @test_mm_mask_rolv_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_rolv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprolvd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_rolv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolvd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__B to <4 x i32>
  %2 = tail call <4 x i32> @llvm.x86.avx512.prolv.d.128(<4 x i32> %0, <4 x i32> %1)
  %3 = bitcast <2 x i64> %__W to <4 x i32>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> %3
  %6 = bitcast <4 x i32> %5 to <2 x i64>
  ret <2 x i64> %6
}
6887
; Zero-masked vprolvd: select against zeroinitializer must fold to
; vprolvd with {%k1} {z} zeroing-masking.
define <2 x i64> @test_mm_maskz_rolv_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_rolv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprolvd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_rolv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolvd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__B to <4 x i32>
  %2 = tail call <4 x i32> @llvm.x86.avx512.prolv.d.128(<4 x i32> %0, <4 x i32> %1)
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> zeroinitializer
  %5 = bitcast <4 x i32> %4 to <2 x i64>
  ret <2 x i64> %5
}
6911
; Unmasked 256-bit variable rotate-left of 32-bit lanes via
; @llvm.x86.avx512.prolv.d.256; expects a single vprolvd on ymm.
define <4 x i64> @test_mm256_rolv_epi32(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_rolv_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprolvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
  %2 = tail call <8 x i32> @llvm.x86.avx512.prolv.d.256(<8 x i32> %0, <8 x i32> %1)
  %3 = bitcast <8 x i32> %2 to <4 x i64>
  ret <4 x i64> %3
}
6924
; Merge-masked 256-bit vprolvd: full 8-bit mask select must fold to
; vprolvd with {%k1} merge-masking (passthru %__W).
define <4 x i64> @test_mm256_mask_rolv_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_rolv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprolvd %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_rolv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolvd %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
  %2 = tail call <8 x i32> @llvm.x86.avx512.prolv.d.256(<8 x i32> %0, <8 x i32> %1)
  %3 = bitcast <4 x i64> %__W to <8 x i32>
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> %3
  %6 = bitcast <8 x i32> %5 to <4 x i64>
  ret <4 x i64> %6
}
6948
; Zero-masked 256-bit vprolvd: full 8-bit mask select against zero must fold
; to vprolvd with {%k1} {z} zeroing-masking.
define <4 x i64> @test_mm256_maskz_rolv_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_rolv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprolvd %ymm1, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_rolv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolvd %ymm1, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
  %2 = tail call <8 x i32> @llvm.x86.avx512.prolv.d.256(<8 x i32> %0, <8 x i32> %1)
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
  %5 = bitcast <8 x i32> %4 to <4 x i64>
  ret <4 x i64> %5
}
6971
; Unmasked variable rotate-left of 64-bit lanes via
; @llvm.x86.avx512.prolv.q.128; expects a single vprolvq.
define <2 x i64> @test_mm_rolv_epi64(<2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_rolv_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprolvq %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.prolv.q.128(<2 x i64> %__A, <2 x i64> %__B)
  ret <2 x i64> %0
}
6981
; Merge-masked vprolvq: select on the low 2 mask bits must fold to
; vprolvq with {%k1} merge-masking (passthru %__W).
define <2 x i64> @test_mm_mask_rolv_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_rolv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprolvq %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_rolv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolvq %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.prolv.q.128(<2 x i64> %__A, <2 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__W
  ret <2 x i64> %2
}
7002
; Zero-masked vprolvq: select against zeroinitializer must fold to
; vprolvq with {%k1} {z} zeroing-masking.
define <2 x i64> @test_mm_maskz_rolv_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_rolv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprolvq %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_rolv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolvq %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.prolv.q.128(<2 x i64> %__A, <2 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
  ret <2 x i64> %2
}
7023
; Unmasked 256-bit variable rotate-left of 64-bit lanes via
; @llvm.x86.avx512.prolv.q.256; expects a single vprolvq on ymm.
define <4 x i64> @test_mm256_rolv_epi64(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_rolv_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprolvq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.prolv.q.256(<4 x i64> %__A, <4 x i64> %__B)
  ret <4 x i64> %0
}
7033
; Merge-masked 256-bit vprolvq: select on the low 4 mask bits must fold to
; vprolvq with {%k1} merge-masking (passthru %__W).
define <4 x i64> @test_mm256_mask_rolv_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_rolv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprolvq %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_rolv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolvq %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.prolv.q.256(<4 x i64> %__A, <4 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__W
  ret <4 x i64> %2
}
7054
; Zero-masked 256-bit vprolvq: select against zeroinitializer must fold to
; vprolvq with {%k1} {z} zeroing-masking.
define <4 x i64> @test_mm256_maskz_rolv_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_rolv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprolvq %ymm1, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_rolv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolvq %ymm1, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.prolv.q.256(<4 x i64> %__A, <4 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
  ret <4 x i64> %2
}
7075
; Unmasked immediate rotate-right of 32-bit lanes via
; @llvm.x86.avx512.pror.d.128; expects a single vprord $5.
define <2 x i64> @test_mm_ror_epi32(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_ror_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprord $5, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.pror.d.128(<4 x i32> %0, i32 5)
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}
7087
7088declare <4 x i32> @llvm.x86.avx512.pror.d.128(<4 x i32>, i32)
7089
; Merge-masked vprord: rotate intrinsic + select on the low 4 mask bits must
; fold to vprord $5 with {%k1} merge-masking (passthru %__W).
define <2 x i64> @test_mm_mask_ror_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_ror_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprord $5, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_ror_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprord $5, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.pror.d.128(<4 x i32> %0, i32 5)
  %2 = bitcast <2 x i64> %__W to <4 x i32>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %2
  %5 = bitcast <4 x i32> %4 to <2 x i64>
  ret <2 x i64> %5
}
7113
; Zero-masked vprord: select against zeroinitializer must fold to
; vprord $5 with {%k1} {z} zeroing-masking.
define <2 x i64> @test_mm_maskz_ror_epi32(i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_ror_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprord $5, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_ror_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprord $5, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.pror.d.128(<4 x i32> %0, i32 5)
  %2 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}
7136
; Unmasked 256-bit immediate rotate-right of 32-bit lanes via
; @llvm.x86.avx512.pror.d.256; expects a single vprord $5 on ymm.
define <4 x i64> @test_mm256_ror_epi32(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_ror_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprord $5, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = tail call <8 x i32> @llvm.x86.avx512.pror.d.256(<8 x i32> %0, i32 5)
  %2 = bitcast <8 x i32> %1 to <4 x i64>
  ret <4 x i64> %2
}
7148
7149declare <8 x i32> @llvm.x86.avx512.pror.d.256(<8 x i32>, i32)
7150
; Merge-masked 256-bit vprord: full 8-bit mask select must fold to
; vprord $5 with {%k1} merge-masking (passthru %__W).
define <4 x i64> @test_mm256_mask_ror_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_ror_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprord $5, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_ror_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprord $5, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = tail call <8 x i32> @llvm.x86.avx512.pror.d.256(<8 x i32> %0, i32 5)
  %2 = bitcast <4 x i64> %__W to <8 x i32>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x i32> %1, <8 x i32> %2
  %5 = bitcast <8 x i32> %4 to <4 x i64>
  ret <4 x i64> %5
}
7173
; Zero-masked 256-bit vprord: full 8-bit mask select against zero must fold
; to vprord $5 with {%k1} {z} zeroing-masking.
define <4 x i64> @test_mm256_maskz_ror_epi32(i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_ror_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprord $5, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_ror_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprord $5, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = tail call <8 x i32> @llvm.x86.avx512.pror.d.256(<8 x i32> %0, i32 5)
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  ret <4 x i64> %4
}
7195
; Unmasked immediate rotate-right of 64-bit lanes via
; @llvm.x86.avx512.pror.q.128; expects a single vprorq $5.
define <2 x i64> @test_mm_ror_epi64(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_ror_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprorq $5, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.pror.q.128(<2 x i64> %__A, i32 5)
  ret <2 x i64> %0
}
7205
7206declare <2 x i64> @llvm.x86.avx512.pror.q.128(<2 x i64>, i32)
7207
; Merge-masked vprorq: select on the low 2 mask bits must fold to
; vprorq $5 with {%k1} merge-masking (passthru %__W).
define <2 x i64> @test_mm_mask_ror_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_ror_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorq $5, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_ror_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorq $5, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.pror.q.128(<2 x i64> %__A, i32 5)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__W
  ret <2 x i64> %2
}
7228
; Zero-masked vprorq: select against zeroinitializer must fold to
; vprorq $5 with {%k1} {z} zeroing-masking.
define <2 x i64> @test_mm_maskz_ror_epi64(i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_ror_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorq $5, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_ror_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorq $5, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.pror.q.128(<2 x i64> %__A, i32 5)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer
  ret <2 x i64> %2
}
7249
; Unmasked 256-bit immediate rotate-right of 64-bit lanes via
; @llvm.x86.avx512.pror.q.256; expects a single vprorq $5 on ymm.
define <4 x i64> @test_mm256_ror_epi64(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_ror_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprorq $5, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.pror.q.256(<4 x i64> %__A, i32 5)
  ret <4 x i64> %0
}
7259
7260declare <4 x i64> @llvm.x86.avx512.pror.q.256(<4 x i64>, i32)
7261
; Merge-masked 256-bit vprorq: select on the low 4 mask bits must fold to
; vprorq $5 with {%k1} merge-masking (passthru %__W).
define <4 x i64> @test_mm256_mask_ror_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_ror_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorq $5, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_ror_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorq $5, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.pror.q.256(<4 x i64> %__A, i32 5)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__W
  ret <4 x i64> %2
}
7282
; Zero-masked 256-bit vprorq: select against zeroinitializer must fold to
; vprorq $5 with {%k1} {z} zeroing-masking.
define <4 x i64> @test_mm256_maskz_ror_epi64(i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_ror_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorq $5, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_ror_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorq $5, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.pror.q.256(<4 x i64> %__A, i32 5)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer
  ret <4 x i64> %2
}
7303
; Unmasked variable rotate-right of 32-bit lanes via
; @llvm.x86.avx512.prorv.d.128; expects a single vprorvd.
define <2 x i64> @test_mm_rorv_epi32(<2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_rorv_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprorvd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__B to <4 x i32>
  %2 = tail call <4 x i32> @llvm.x86.avx512.prorv.d.128(<4 x i32> %0, <4 x i32> %1)
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}
7316
; Merge-masked vprorvd: variable rotate + select on the low 4 mask bits must
; fold to vprorvd with {%k1} merge-masking (passthru %__W).
define <2 x i64> @test_mm_mask_rorv_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_rorv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_rorv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__B to <4 x i32>
  %2 = tail call <4 x i32> @llvm.x86.avx512.prorv.d.128(<4 x i32> %0, <4 x i32> %1)
  %3 = bitcast <2 x i64> %__W to <4 x i32>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> %3
  %6 = bitcast <4 x i32> %5 to <2 x i64>
  ret <2 x i64> %6
}
7341
; Zero-masked 32-bit variable rotate-right: lanes whose mask bit in %__U
; is clear are zeroed instead of merged. Expected codegen is vprorvd with
; a {%k1} {z} zero-mask on the destination.
define <2 x i64> @test_mm_maskz_rorv_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_rorv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_rorv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__B to <4 x i32>
  %2 = tail call <4 x i32> @llvm.x86.avx512.prorv.d.128(<4 x i32> %0, <4 x i32> %1)
  %3 = bitcast i8 %__U to <8 x i1>
  ; Only the low 4 bits of the 8-bit mask apply to a 4-lane operation.
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> zeroinitializer
  %5 = bitcast <4 x i32> %4 to <2 x i64>
  ret <2 x i64> %5
}
7365
; 256-bit unmasked per-lane variable rotate-right of 32-bit elements.
; Arguments are reinterpreted as <8 x i32> for the prorv.d.256 intrinsic
; (emitted as vprorvd on ymm registers), then cast back.
define <4 x i64> @test_mm256_rorv_epi32(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_rorv_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprorvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
  %2 = tail call <8 x i32> @llvm.x86.avx512.prorv.d.256(<8 x i32> %0, <8 x i32> %1)
  %3 = bitcast <8 x i32> %2 to <4 x i64>
  ret <4 x i64> %3
}
7378
; 256-bit merge-masked 32-bit variable rotate-right. With 8 lanes of i32,
; all 8 bits of %__U are used directly as the select mask (no shufflevector
; extract needed, unlike the 128-bit variants).
define <4 x i64> @test_mm256_mask_rorv_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_rorv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvd %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_rorv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvd %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
  %2 = tail call <8 x i32> @llvm.x86.avx512.prorv.d.256(<8 x i32> %0, <8 x i32> %1)
  %3 = bitcast <4 x i64> %__W to <8 x i32>
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> %3
  %6 = bitcast <8 x i32> %5 to <4 x i64>
  ret <4 x i64> %6
}
7402
; 256-bit zero-masked 32-bit variable rotate-right: unselected lanes are
; zeroed. All 8 bits of %__U map directly onto the 8 i32 lanes.
define <4 x i64> @test_mm256_maskz_rorv_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_rorv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvd %ymm1, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_rorv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvd %ymm1, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
  %2 = tail call <8 x i32> @llvm.x86.avx512.prorv.d.256(<8 x i32> %0, <8 x i32> %1)
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
  %5 = bitcast <8 x i32> %4 to <4 x i64>
  ret <4 x i64> %5
}
7425
; Unmasked per-lane variable rotate-right of 64-bit elements. No bitcasts
; are needed since the argument type already matches the prorv.q.128
; intrinsic (emitted as vprorvq).
define <2 x i64> @test_mm_rorv_epi64(<2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_rorv_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprorvq %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.prorv.q.128(<2 x i64> %__A, <2 x i64> %__B)
  ret <2 x i64> %0
}
7435
; Merge-masked 64-bit variable rotate-right: masked-off lanes keep the
; passthrough value from %__W. Expected codegen is vprorvq with {%k1}.
define <2 x i64> @test_mm_mask_rorv_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_rorv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvq %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_rorv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvq %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.prorv.q.128(<2 x i64> %__A, <2 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  ; Only the low 2 bits of the 8-bit mask apply to a 2-lane operation.
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__W
  ret <2 x i64> %2
}
7456
; Zero-masked 64-bit variable rotate-right: masked-off lanes are zeroed.
; Expected codegen is vprorvq with {%k1} {z}.
define <2 x i64> @test_mm_maskz_rorv_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_rorv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvq %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_rorv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvq %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.prorv.q.128(<2 x i64> %__A, <2 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  ; Only the low 2 bits of the 8-bit mask apply to a 2-lane operation.
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
  ret <2 x i64> %2
}
7477
; 256-bit unmasked per-lane variable rotate-right of 64-bit elements,
; lowered via the prorv.q.256 intrinsic (emitted as vprorvq on ymm).
define <4 x i64> @test_mm256_rorv_epi64(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_rorv_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprorvq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.prorv.q.256(<4 x i64> %__A, <4 x i64> %__B)
  ret <4 x i64> %0
}
7487
; 256-bit merge-masked 64-bit variable rotate-right: masked-off lanes keep
; the passthrough value from %__W. Expected codegen is vprorvq with {%k1}.
define <4 x i64> @test_mm256_mask_rorv_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_rorv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvq %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_rorv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvq %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.prorv.q.256(<4 x i64> %__A, <4 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  ; Only the low 4 bits of the 8-bit mask apply to a 4-lane operation.
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__W
  ret <4 x i64> %2
}
7508
; 256-bit zero-masked 64-bit variable rotate-right: masked-off lanes are
; zeroed. Expected codegen is vprorvq with {%k1} {z}.
define <4 x i64> @test_mm256_maskz_rorv_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_rorv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvq %ymm1, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_rorv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvq %ymm1, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.prorv.q.256(<4 x i64> %__A, <4 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  ; Only the low 4 bits of the 8-bit mask apply to a 4-lane operation.
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
  ret <4 x i64> %2
}
7529
7530declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>)
7531declare <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32>)
7532declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double>, <4 x i32>, i8)
7533declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>)
7534declare <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double>, <4 x float>, i8)
7535declare <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double>)
7536declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double>, <4 x i32>, i8)
7537declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double>, <4 x i32>, i8)
7538declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>)
7539declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>)
7540declare <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float>, <4 x i32>, i8)
7541declare <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float>, <8 x i32>, i8)
7542declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double>, <4 x i32>, i8)
7543declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>)
7544declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double>, <4 x i32>, i8)
7545declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double>, <4 x i32>, i8)
7546declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>)
7547declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>)
7548declare <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float>, <4 x i32>, i8)
7549declare <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float>, <8 x i32>, i8)
7550declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32>, <8 x i16>, i8)
7551declare <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>)
7552declare <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>)
7553declare <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>)
7554declare <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>)
7555declare <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float>, <4 x i32>, <4 x float>)
7556declare <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>)
7557declare <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>)
7558declare <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>)
7559declare <2 x double> @llvm.masked.expandload.v2f64(double*, <2 x i1>, <2 x double>)
7560declare <4 x double> @llvm.masked.expandload.v4f64(double*, <4 x i1>, <4 x double>)
7561declare <2 x i64> @llvm.masked.expandload.v2i64(i64*, <2 x i1>, <2 x i64>)
7562declare <4 x i64> @llvm.masked.expandload.v4i64(i64*, <4 x i1>, <4 x i64>)
7563declare <4 x float> @llvm.masked.expandload.v4f32(float*, <4 x i1>, <4 x float>)
7564declare <8 x float> @llvm.masked.expandload.v8f32(float*, <8 x i1>, <8 x float>)
7565declare <4 x i32> @llvm.masked.expandload.v4i32(i32*, <4 x i1>, <4 x i32>)
7566declare <8 x i32> @llvm.masked.expandload.v8i32(i32*, <8 x i1>, <8 x i32>)
7567declare void @llvm.masked.compressstore.v2f64(<2 x double>, double*, <2 x i1>)
7568declare void @llvm.masked.compressstore.v4f64(<4 x double>, double*, <4 x i1>)
7569declare void @llvm.masked.compressstore.v2i64(<2 x i64>, i64*, <2 x i1>)
7570declare void @llvm.masked.compressstore.v4i64(<4 x i64>, i64*, <4 x i1>)
7571declare void @llvm.masked.compressstore.v4f32(<4 x float>, float*, <4 x i1>)
7572declare void @llvm.masked.compressstore.v8f32(<8 x float>, float*, <8 x i1>)
7573declare void @llvm.masked.compressstore.v4i32(<4 x i32>, i32*, <4 x i1>)
7574declare void @llvm.masked.compressstore.v8i32(<8 x i32>, i32*, <8 x i1>)
7575declare <4 x i32> @llvm.x86.avx512.prolv.d.128(<4 x i32>, <4 x i32>)
7576declare <8 x i32> @llvm.x86.avx512.prolv.d.256(<8 x i32>, <8 x i32>)
7577declare <2 x i64> @llvm.x86.avx512.prolv.q.128(<2 x i64>, <2 x i64>)
7578declare <4 x i64> @llvm.x86.avx512.prolv.q.256(<4 x i64>, <4 x i64>)
7579declare <4 x i32> @llvm.x86.avx512.prorv.d.128(<4 x i32>, <4 x i32>)
7580declare <8 x i32> @llvm.x86.avx512.prorv.d.256(<8 x i32>, <8 x i32>)
7581declare <2 x i64> @llvm.x86.avx512.prorv.q.128(<2 x i64>, <2 x i64>)
7582declare <4 x i64> @llvm.x86.avx512.prorv.q.256(<4 x i64>, <4 x i64>)
7583
7584!0 = !{i32 1}
7585