1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86
3; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64
4
5; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vl-builtins.c
6
7define <4 x float> @test_mm_mask_cvtepi32_ps(<4 x float> %__W, i8 zeroext %__U, <2 x i64> %__A) {
8; X86-LABEL: test_mm_mask_cvtepi32_ps:
9; X86:       # %bb.0: # %entry
10; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
11; X86-NEXT:    kmovw %eax, %k1
12; X86-NEXT:    vcvtdq2ps %xmm1, %xmm0 {%k1}
13; X86-NEXT:    retl
14;
15; X64-LABEL: test_mm_mask_cvtepi32_ps:
16; X64:       # %bb.0: # %entry
17; X64-NEXT:    kmovw %edi, %k1
18; X64-NEXT:    vcvtdq2ps %xmm1, %xmm0 {%k1}
19; X64-NEXT:    retq
20entry:
21  %0 = bitcast <2 x i64> %__A to <4 x i32>
22  %conv.i.i = sitofp <4 x i32> %0 to <4 x float>
23  %1 = bitcast i8 %__U to <8 x i1>
24  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
25  %2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> %__W
26  ret <4 x float> %2
27}
28
29define <4 x float> @test_mm_maskz_cvtepi32_ps(i8 zeroext %__U, <2 x i64> %__A) {
30; X86-LABEL: test_mm_maskz_cvtepi32_ps:
31; X86:       # %bb.0: # %entry
32; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
33; X86-NEXT:    kmovw %eax, %k1
34; X86-NEXT:    vcvtdq2ps %xmm0, %xmm0 {%k1} {z}
35; X86-NEXT:    retl
36;
37; X64-LABEL: test_mm_maskz_cvtepi32_ps:
38; X64:       # %bb.0: # %entry
39; X64-NEXT:    kmovw %edi, %k1
40; X64-NEXT:    vcvtdq2ps %xmm0, %xmm0 {%k1} {z}
41; X64-NEXT:    retq
42entry:
43  %0 = bitcast <2 x i64> %__A to <4 x i32>
44  %conv.i.i = sitofp <4 x i32> %0 to <4 x float>
45  %1 = bitcast i8 %__U to <8 x i1>
46  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
47  %2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> zeroinitializer
48  ret <4 x float> %2
49}
50
51define <8 x float> @test_mm256_mask_cvtepi32_ps(<8 x float> %__W, i8 zeroext %__U, <4 x i64> %__A) {
52; X86-LABEL: test_mm256_mask_cvtepi32_ps:
53; X86:       # %bb.0: # %entry
54; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
55; X86-NEXT:    kmovw %eax, %k1
56; X86-NEXT:    vcvtdq2ps %ymm1, %ymm0 {%k1}
57; X86-NEXT:    retl
58;
59; X64-LABEL: test_mm256_mask_cvtepi32_ps:
60; X64:       # %bb.0: # %entry
61; X64-NEXT:    kmovw %edi, %k1
62; X64-NEXT:    vcvtdq2ps %ymm1, %ymm0 {%k1}
63; X64-NEXT:    retq
64entry:
65  %0 = bitcast <4 x i64> %__A to <8 x i32>
66  %conv.i.i = sitofp <8 x i32> %0 to <8 x float>
67  %1 = bitcast i8 %__U to <8 x i1>
68  %2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> %__W
69  ret <8 x float> %2
70}
71
72define <8 x float> @test_mm256_maskz_cvtepi32_ps(i8 zeroext %__U, <4 x i64> %__A) {
73; X86-LABEL: test_mm256_maskz_cvtepi32_ps:
74; X86:       # %bb.0: # %entry
75; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
76; X86-NEXT:    kmovw %eax, %k1
77; X86-NEXT:    vcvtdq2ps %ymm0, %ymm0 {%k1} {z}
78; X86-NEXT:    retl
79;
80; X64-LABEL: test_mm256_maskz_cvtepi32_ps:
81; X64:       # %bb.0: # %entry
82; X64-NEXT:    kmovw %edi, %k1
83; X64-NEXT:    vcvtdq2ps %ymm0, %ymm0 {%k1} {z}
84; X64-NEXT:    retq
85entry:
86  %0 = bitcast <4 x i64> %__A to <8 x i32>
87  %conv.i.i = sitofp <8 x i32> %0 to <8 x float>
88  %1 = bitcast i8 %__U to <8 x i1>
89  %2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> zeroinitializer
90  ret <8 x float> %2
91}
92
93define <2 x i64> @test_mm_mask_cvtpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) {
94; X86-LABEL: test_mm_mask_cvtpd_epi32:
95; X86:       # %bb.0: # %entry
96; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
97; X86-NEXT:    kmovw %eax, %k1
98; X86-NEXT:    vcvtpd2dq %xmm1, %xmm0 {%k1}
99; X86-NEXT:    retl
100;
101; X64-LABEL: test_mm_mask_cvtpd_epi32:
102; X64:       # %bb.0: # %entry
103; X64-NEXT:    kmovw %edi, %k1
104; X64-NEXT:    vcvtpd2dq %xmm1, %xmm0 {%k1}
105; X64-NEXT:    retq
106entry:
107  %0 = bitcast <2 x i64> %__W to <4 x i32>
108  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8
109  %2 = bitcast <4 x i32> %1 to <2 x i64>
110  ret <2 x i64> %2
111}
112
113define <2 x i64> @test_mm_maskz_cvtpd_epi32(i8 zeroext %__U, <2 x double> %__A) {
114; X86-LABEL: test_mm_maskz_cvtpd_epi32:
115; X86:       # %bb.0: # %entry
116; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
117; X86-NEXT:    kmovw %eax, %k1
118; X86-NEXT:    vcvtpd2dq %xmm0, %xmm0 {%k1} {z}
119; X86-NEXT:    retl
120;
121; X64-LABEL: test_mm_maskz_cvtpd_epi32:
122; X64:       # %bb.0: # %entry
123; X64-NEXT:    kmovw %edi, %k1
124; X64-NEXT:    vcvtpd2dq %xmm0, %xmm0 {%k1} {z}
125; X64-NEXT:    retq
126entry:
127  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
128  %1 = bitcast <4 x i32> %0 to <2 x i64>
129  ret <2 x i64> %1
130}
131
132define <2 x i64> @test_mm256_mask_cvtpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) {
133; X86-LABEL: test_mm256_mask_cvtpd_epi32:
134; X86:       # %bb.0: # %entry
135; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
136; X86-NEXT:    kmovw %eax, %k1
137; X86-NEXT:    vcvtpd2dq %ymm1, %xmm0 {%k1}
138; X86-NEXT:    vzeroupper
139; X86-NEXT:    retl
140;
141; X64-LABEL: test_mm256_mask_cvtpd_epi32:
142; X64:       # %bb.0: # %entry
143; X64-NEXT:    kmovw %edi, %k1
144; X64-NEXT:    vcvtpd2dq %ymm1, %xmm0 {%k1}
145; X64-NEXT:    vzeroupper
146; X64-NEXT:    retq
147entry:
148  %0 = tail call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %__A) #8
149  %1 = bitcast <2 x i64> %__W to <4 x i32>
150  %2 = bitcast i8 %__U to <8 x i1>
151  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
152  %3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1
153  %4 = bitcast <4 x i32> %3 to <2 x i64>
154  ret <2 x i64> %4
155}
156
157define <2 x i64> @test_mm256_maskz_cvtpd_epi32(i8 zeroext %__U, <4 x double> %__A) {
158; X86-LABEL: test_mm256_maskz_cvtpd_epi32:
159; X86:       # %bb.0: # %entry
160; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
161; X86-NEXT:    kmovw %eax, %k1
162; X86-NEXT:    vcvtpd2dq %ymm0, %xmm0 {%k1} {z}
163; X86-NEXT:    vzeroupper
164; X86-NEXT:    retl
165;
166; X64-LABEL: test_mm256_maskz_cvtpd_epi32:
167; X64:       # %bb.0: # %entry
168; X64-NEXT:    kmovw %edi, %k1
169; X64-NEXT:    vcvtpd2dq %ymm0, %xmm0 {%k1} {z}
170; X64-NEXT:    vzeroupper
171; X64-NEXT:    retq
172entry:
173  %0 = tail call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %__A) #8
174  %1 = bitcast i8 %__U to <8 x i1>
175  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
176  %2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer
177  %3 = bitcast <4 x i32> %2 to <2 x i64>
178  ret <2 x i64> %3
179}
180
181define <4 x float> @test_mm_mask_cvtpd_ps(<4 x float> %__W, i8 zeroext %__U, <2 x double> %__A) {
182; X86-LABEL: test_mm_mask_cvtpd_ps:
183; X86:       # %bb.0: # %entry
184; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
185; X86-NEXT:    kmovw %eax, %k1
186; X86-NEXT:    vcvtpd2ps %xmm1, %xmm0 {%k1}
187; X86-NEXT:    retl
188;
189; X64-LABEL: test_mm_mask_cvtpd_ps:
190; X64:       # %bb.0: # %entry
191; X64-NEXT:    kmovw %edi, %k1
192; X64-NEXT:    vcvtpd2ps %xmm1, %xmm0 {%k1}
193; X64-NEXT:    retq
194entry:
195  %0 = tail call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %__A, <4 x float> %__W, i8 %__U) #8
196  ret <4 x float> %0
197}
198
199define <4 x float> @test_mm_maskz_cvtpd_ps(i8 zeroext %__U, <2 x double> %__A) {
200; X86-LABEL: test_mm_maskz_cvtpd_ps:
201; X86:       # %bb.0: # %entry
202; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
203; X86-NEXT:    kmovw %eax, %k1
204; X86-NEXT:    vcvtpd2ps %xmm0, %xmm0 {%k1} {z}
205; X86-NEXT:    retl
206;
207; X64-LABEL: test_mm_maskz_cvtpd_ps:
208; X64:       # %bb.0: # %entry
209; X64-NEXT:    kmovw %edi, %k1
210; X64-NEXT:    vcvtpd2ps %xmm0, %xmm0 {%k1} {z}
211; X64-NEXT:    retq
212entry:
213  %0 = tail call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %__A, <4 x float> zeroinitializer, i8 %__U) #8
214  ret <4 x float> %0
215}
216
217define <4 x float> @test_mm256_mask_cvtpd_ps(<4 x float> %__W, i8 zeroext %__U, <4 x double> %__A) {
218; X86-LABEL: test_mm256_mask_cvtpd_ps:
219; X86:       # %bb.0: # %entry
220; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
221; X86-NEXT:    kmovw %eax, %k1
222; X86-NEXT:    vcvtpd2ps %ymm1, %xmm0 {%k1}
223; X86-NEXT:    vzeroupper
224; X86-NEXT:    retl
225;
226; X64-LABEL: test_mm256_mask_cvtpd_ps:
227; X64:       # %bb.0: # %entry
228; X64-NEXT:    kmovw %edi, %k1
229; X64-NEXT:    vcvtpd2ps %ymm1, %xmm0 {%k1}
230; X64-NEXT:    vzeroupper
231; X64-NEXT:    retq
232entry:
233  %0 = tail call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %__A) #8
234  %1 = bitcast i8 %__U to <8 x i1>
235  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
236  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__W
237  ret <4 x float> %2
238}
239
240define <4 x float> @test_mm256_maskz_cvtpd_ps(i8 zeroext %__U, <4 x double> %__A) {
241; X86-LABEL: test_mm256_maskz_cvtpd_ps:
242; X86:       # %bb.0: # %entry
243; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
244; X86-NEXT:    kmovw %eax, %k1
245; X86-NEXT:    vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
246; X86-NEXT:    vzeroupper
247; X86-NEXT:    retl
248;
249; X64-LABEL: test_mm256_maskz_cvtpd_ps:
250; X64:       # %bb.0: # %entry
251; X64-NEXT:    kmovw %edi, %k1
252; X64-NEXT:    vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
253; X64-NEXT:    vzeroupper
254; X64-NEXT:    retq
255entry:
256  %0 = tail call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %__A) #8
257  %1 = bitcast i8 %__U to <8 x i1>
258  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
259  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
260  ret <4 x float> %2
261}
262
263define <2 x i64> @test_mm_cvtpd_epu32(<2 x double> %__A) {
264; CHECK-LABEL: test_mm_cvtpd_epu32:
265; CHECK:       # %bb.0: # %entry
266; CHECK-NEXT:    vcvtpd2udq %xmm0, %xmm0
267; CHECK-NEXT:    ret{{[l|q]}}
268entry:
269  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8
270  %1 = bitcast <4 x i32> %0 to <2 x i64>
271  ret <2 x i64> %1
272}
273
274define <2 x i64> @test_mm_mask_cvtpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) {
275; X86-LABEL: test_mm_mask_cvtpd_epu32:
276; X86:       # %bb.0: # %entry
277; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
278; X86-NEXT:    kmovw %eax, %k1
279; X86-NEXT:    vcvtpd2udq %xmm1, %xmm0 {%k1}
280; X86-NEXT:    retl
281;
282; X64-LABEL: test_mm_mask_cvtpd_epu32:
283; X64:       # %bb.0: # %entry
284; X64-NEXT:    kmovw %edi, %k1
285; X64-NEXT:    vcvtpd2udq %xmm1, %xmm0 {%k1}
286; X64-NEXT:    retq
287entry:
288  %0 = bitcast <2 x i64> %__W to <4 x i32>
289  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8
290  %2 = bitcast <4 x i32> %1 to <2 x i64>
291  ret <2 x i64> %2
292}
293
294define <2 x i64> @test_mm_maskz_cvtpd_epu32(i8 zeroext %__U, <2 x double> %__A) {
295; X86-LABEL: test_mm_maskz_cvtpd_epu32:
296; X86:       # %bb.0: # %entry
297; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
298; X86-NEXT:    kmovw %eax, %k1
299; X86-NEXT:    vcvtpd2udq %xmm0, %xmm0 {%k1} {z}
300; X86-NEXT:    retl
301;
302; X64-LABEL: test_mm_maskz_cvtpd_epu32:
303; X64:       # %bb.0: # %entry
304; X64-NEXT:    kmovw %edi, %k1
305; X64-NEXT:    vcvtpd2udq %xmm0, %xmm0 {%k1} {z}
306; X64-NEXT:    retq
307entry:
308  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
309  %1 = bitcast <4 x i32> %0 to <2 x i64>
310  ret <2 x i64> %1
311}
312
313define <2 x i64> @test_mm256_cvtpd_epu32(<4 x double> %__A) {
314; CHECK-LABEL: test_mm256_cvtpd_epu32:
315; CHECK:       # %bb.0: # %entry
316; CHECK-NEXT:    vcvtpd2udq %ymm0, %xmm0
317; CHECK-NEXT:    vzeroupper
318; CHECK-NEXT:    ret{{[l|q]}}
319entry:
320  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8
321  %1 = bitcast <4 x i32> %0 to <2 x i64>
322  ret <2 x i64> %1
323}
324
325define <2 x i64> @test_mm256_mask_cvtpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) {
326; X86-LABEL: test_mm256_mask_cvtpd_epu32:
327; X86:       # %bb.0: # %entry
328; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
329; X86-NEXT:    kmovw %eax, %k1
330; X86-NEXT:    vcvtpd2udq %ymm1, %xmm0 {%k1}
331; X86-NEXT:    vzeroupper
332; X86-NEXT:    retl
333;
334; X64-LABEL: test_mm256_mask_cvtpd_epu32:
335; X64:       # %bb.0: # %entry
336; X64-NEXT:    kmovw %edi, %k1
337; X64-NEXT:    vcvtpd2udq %ymm1, %xmm0 {%k1}
338; X64-NEXT:    vzeroupper
339; X64-NEXT:    retq
340entry:
341  %0 = bitcast <2 x i64> %__W to <4 x i32>
342  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %__A, <4 x i32> %0, i8 %__U) #8
343  %2 = bitcast <4 x i32> %1 to <2 x i64>
344  ret <2 x i64> %2
345}
346
347define <2 x i64> @test_mm256_maskz_cvtpd_epu32(i8 zeroext %__U, <4 x double> %__A) {
348; X86-LABEL: test_mm256_maskz_cvtpd_epu32:
349; X86:       # %bb.0: # %entry
350; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
351; X86-NEXT:    kmovw %eax, %k1
352; X86-NEXT:    vcvtpd2udq %ymm0, %xmm0 {%k1} {z}
353; X86-NEXT:    vzeroupper
354; X86-NEXT:    retl
355;
356; X64-LABEL: test_mm256_maskz_cvtpd_epu32:
357; X64:       # %bb.0: # %entry
358; X64-NEXT:    kmovw %edi, %k1
359; X64-NEXT:    vcvtpd2udq %ymm0, %xmm0 {%k1} {z}
360; X64-NEXT:    vzeroupper
361; X64-NEXT:    retq
362entry:
363  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
364  %1 = bitcast <4 x i32> %0 to <2 x i64>
365  ret <2 x i64> %1
366}
367
368define <4 x float> @test_mm_mask_cvtph_ps(<4 x float> %__W, i8 zeroext %__U, <2 x i64> %__A) {
369; X86-LABEL: test_mm_mask_cvtph_ps:
370; X86:       # %bb.0: # %entry
371; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
372; X86-NEXT:    kmovw %eax, %k1
373; X86-NEXT:    vcvtph2ps %xmm1, %xmm0 {%k1}
374; X86-NEXT:    retl
375;
376; X64-LABEL: test_mm_mask_cvtph_ps:
377; X64:       # %bb.0: # %entry
378; X64-NEXT:    kmovw %edi, %k1
379; X64-NEXT:    vcvtph2ps %xmm1, %xmm0 {%k1}
380; X64-NEXT:    retq
381entry:
382  %0 = bitcast <2 x i64> %__A to <8 x i16>
383  %1 = shufflevector <8 x i16> %0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
384  %2 = bitcast <4 x i16> %1 to <4 x half>
385  %3 = bitcast i8 %__U to <8 x i1>
386  %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
387  %5 = fpext <4 x half> %2 to <4 x float>
388  %6 = select <4 x i1> %4, <4 x float> %5, <4 x float> %__W
389  ret <4 x float> %6
390}
391
392define <4 x float> @test_mm_maskz_cvtph_ps(i8 zeroext %__U, <2 x i64> %__A) {
393; X86-LABEL: test_mm_maskz_cvtph_ps:
394; X86:       # %bb.0: # %entry
395; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
396; X86-NEXT:    kmovw %eax, %k1
397; X86-NEXT:    vcvtph2ps %xmm0, %xmm0 {%k1} {z}
398; X86-NEXT:    retl
399;
400; X64-LABEL: test_mm_maskz_cvtph_ps:
401; X64:       # %bb.0: # %entry
402; X64-NEXT:    kmovw %edi, %k1
403; X64-NEXT:    vcvtph2ps %xmm0, %xmm0 {%k1} {z}
404; X64-NEXT:    retq
405entry:
406  %0 = bitcast <2 x i64> %__A to <8 x i16>
407  %1 = shufflevector <8 x i16> %0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
408  %2 = bitcast <4 x i16> %1 to <4 x half>
409  %3 = bitcast i8 %__U to <8 x i1>
410  %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
411  %5 = fpext <4 x half> %2 to <4 x float>
412  %6 = select <4 x i1> %4, <4 x float> %5, <4 x float> zeroinitializer
413  ret <4 x float> %6
414}
415
416define <8 x float> @test_mm256_mask_cvtph_ps(<8 x float> %__W, i8 zeroext %__U, <2 x i64> %__A) {
417; X86-LABEL: test_mm256_mask_cvtph_ps:
418; X86:       # %bb.0: # %entry
419; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
420; X86-NEXT:    kmovw %eax, %k1
421; X86-NEXT:    vcvtph2ps %xmm1, %ymm0 {%k1}
422; X86-NEXT:    retl
423;
424; X64-LABEL: test_mm256_mask_cvtph_ps:
425; X64:       # %bb.0: # %entry
426; X64-NEXT:    kmovw %edi, %k1
427; X64-NEXT:    vcvtph2ps %xmm1, %ymm0 {%k1}
428; X64-NEXT:    retq
429entry:
430  %0 = bitcast <2 x i64> %__A to <8 x i16>
431  %1 = bitcast <8 x i16> %0 to <8 x half>
432  %2 = bitcast i8 %__U to <8 x i1>
433  %3 = fpext <8 x half> %1 to <8 x float>
434  %4 = select <8 x i1> %2, <8 x float> %3, <8 x float> %__W
435  ret <8 x float> %4
436}
437
438define <8 x float> @test_mm256_maskz_cvtph_ps(i8 zeroext %__U, <2 x i64> %__A) {
439; X86-LABEL: test_mm256_maskz_cvtph_ps:
440; X86:       # %bb.0: # %entry
441; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
442; X86-NEXT:    kmovw %eax, %k1
443; X86-NEXT:    vcvtph2ps %xmm0, %ymm0 {%k1} {z}
444; X86-NEXT:    retl
445;
446; X64-LABEL: test_mm256_maskz_cvtph_ps:
447; X64:       # %bb.0: # %entry
448; X64-NEXT:    kmovw %edi, %k1
449; X64-NEXT:    vcvtph2ps %xmm0, %ymm0 {%k1} {z}
450; X64-NEXT:    retq
451entry:
452  %0 = bitcast <2 x i64> %__A to <8 x i16>
453  %1 = bitcast <8 x i16> %0 to <8 x half>
454  %2 = bitcast i8 %__U to <8 x i1>
455  %3 = fpext <8 x half> %1 to <8 x float>
456  %4 = select <8 x i1> %2, <8 x float> %3, <8 x float> zeroinitializer
457  ret <8 x float> %4
458}
459
460define <2 x i64> @test_mm_mask_cvtps_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) {
461; X86-LABEL: test_mm_mask_cvtps_epi32:
462; X86:       # %bb.0: # %entry
463; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
464; X86-NEXT:    kmovw %eax, %k1
465; X86-NEXT:    vcvtps2dq %xmm1, %xmm0 {%k1}
466; X86-NEXT:    retl
467;
468; X64-LABEL: test_mm_mask_cvtps_epi32:
469; X64:       # %bb.0: # %entry
470; X64-NEXT:    kmovw %edi, %k1
471; X64-NEXT:    vcvtps2dq %xmm1, %xmm0 {%k1}
472; X64-NEXT:    retq
473entry:
474  %0 = tail call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %__A) #8
475  %1 = bitcast <2 x i64> %__W to <4 x i32>
476  %2 = bitcast i8 %__U to <8 x i1>
477  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
478  %3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1
479  %4 = bitcast <4 x i32> %3 to <2 x i64>
480  ret <2 x i64> %4
481}
482
483define <2 x i64> @test_mm_maskz_cvtps_epi32(i8 zeroext %__U, <4 x float> %__A) {
484; X86-LABEL: test_mm_maskz_cvtps_epi32:
485; X86:       # %bb.0: # %entry
486; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
487; X86-NEXT:    kmovw %eax, %k1
488; X86-NEXT:    vcvtps2dq %xmm0, %xmm0 {%k1} {z}
489; X86-NEXT:    retl
490;
491; X64-LABEL: test_mm_maskz_cvtps_epi32:
492; X64:       # %bb.0: # %entry
493; X64-NEXT:    kmovw %edi, %k1
494; X64-NEXT:    vcvtps2dq %xmm0, %xmm0 {%k1} {z}
495; X64-NEXT:    retq
496entry:
497  %0 = tail call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %__A) #8
498  %1 = bitcast i8 %__U to <8 x i1>
499  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
500  %2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer
501  %3 = bitcast <4 x i32> %2 to <2 x i64>
502  ret <2 x i64> %3
503}
504
505define <4 x i64> @test_mm256_mask_cvtps_epi32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
506; X86-LABEL: test_mm256_mask_cvtps_epi32:
507; X86:       # %bb.0: # %entry
508; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
509; X86-NEXT:    kmovw %eax, %k1
510; X86-NEXT:    vcvtps2dq %ymm1, %ymm0 {%k1}
511; X86-NEXT:    retl
512;
513; X64-LABEL: test_mm256_mask_cvtps_epi32:
514; X64:       # %bb.0: # %entry
515; X64-NEXT:    kmovw %edi, %k1
516; X64-NEXT:    vcvtps2dq %ymm1, %ymm0 {%k1}
517; X64-NEXT:    retq
518entry:
519  %0 = tail call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %__A) #8
520  %1 = bitcast <4 x i64> %__W to <8 x i32>
521  %2 = bitcast i8 %__U to <8 x i1>
522  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
523  %4 = bitcast <8 x i32> %3 to <4 x i64>
524  ret <4 x i64> %4
525}
526
527define <4 x i64> @test_mm256_maskz_cvtps_epi32(i8 zeroext %__U, <8 x float> %__A) {
528; X86-LABEL: test_mm256_maskz_cvtps_epi32:
529; X86:       # %bb.0: # %entry
530; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
531; X86-NEXT:    kmovw %eax, %k1
532; X86-NEXT:    vcvtps2dq %ymm0, %ymm0 {%k1} {z}
533; X86-NEXT:    retl
534;
535; X64-LABEL: test_mm256_maskz_cvtps_epi32:
536; X64:       # %bb.0: # %entry
537; X64-NEXT:    kmovw %edi, %k1
538; X64-NEXT:    vcvtps2dq %ymm0, %ymm0 {%k1} {z}
539; X64-NEXT:    retq
540entry:
541  %0 = tail call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %__A) #8
542  %1 = bitcast i8 %__U to <8 x i1>
543  %2 = select <8 x i1> %1, <8 x i32> %0, <8 x i32> zeroinitializer
544  %3 = bitcast <8 x i32> %2 to <4 x i64>
545  ret <4 x i64> %3
546}
547
548define <2 x double> @test_mm_mask_cvtps_pd(<2 x double> %__W, i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 {
549; X86-LABEL: test_mm_mask_cvtps_pd:
550; X86:       # %bb.0: # %entry
551; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
552; X86-NEXT:    kmovw %eax, %k1
553; X86-NEXT:    vcvtps2pd %xmm1, %xmm0 {%k1}
554; X86-NEXT:    retl
555;
556; X64-LABEL: test_mm_mask_cvtps_pd:
557; X64:       # %bb.0: # %entry
558; X64-NEXT:    kmovw %edi, %k1
559; X64-NEXT:    vcvtps2pd %xmm1, %xmm0 {%k1}
560; X64-NEXT:    retq
561entry:
562  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <2 x i32> <i32 0, i32 1>
563  %conv.i.i = fpext <2 x float> %shuffle.i.i to <2 x double>
564  %0 = bitcast i8 %__U to <8 x i1>
565  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
566  %1 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> %__W
567  ret <2 x double> %1
568}
569
570define <2 x double> @test_mm_maskz_cvtps_pd(i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 {
571; X86-LABEL: test_mm_maskz_cvtps_pd:
572; X86:       # %bb.0: # %entry
573; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
574; X86-NEXT:    kmovw %eax, %k1
575; X86-NEXT:    vcvtps2pd %xmm0, %xmm0 {%k1} {z}
576; X86-NEXT:    retl
577;
578; X64-LABEL: test_mm_maskz_cvtps_pd:
579; X64:       # %bb.0: # %entry
580; X64-NEXT:    kmovw %edi, %k1
581; X64-NEXT:    vcvtps2pd %xmm0, %xmm0 {%k1} {z}
582; X64-NEXT:    retq
583entry:
584  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <2 x i32> <i32 0, i32 1>
585  %conv.i.i = fpext <2 x float> %shuffle.i.i to <2 x double>
586  %0 = bitcast i8 %__U to <8 x i1>
587  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
588  %1 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> zeroinitializer
589  ret <2 x double> %1
590}
591
592define <4 x double> @test_mm256_mask_cvtps_pd(<4 x double> %__W, i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 {
593; X86-LABEL: test_mm256_mask_cvtps_pd:
594; X86:       # %bb.0: # %entry
595; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
596; X86-NEXT:    kmovw %eax, %k1
597; X86-NEXT:    vcvtps2pd %xmm1, %ymm0 {%k1}
598; X86-NEXT:    retl
599;
600; X64-LABEL: test_mm256_mask_cvtps_pd:
601; X64:       # %bb.0: # %entry
602; X64-NEXT:    kmovw %edi, %k1
603; X64-NEXT:    vcvtps2pd %xmm1, %ymm0 {%k1}
604; X64-NEXT:    retq
605entry:
606  %conv.i.i = fpext <4 x float> %__A to <4 x double>
607  %0 = bitcast i8 %__U to <8 x i1>
608  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
609  %1 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> %__W
610  ret <4 x double> %1
611}
612
613define <4 x double> @test_mm256_maskz_cvtps_pd(i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 {
614; X86-LABEL: test_mm256_maskz_cvtps_pd:
615; X86:       # %bb.0: # %entry
616; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
617; X86-NEXT:    kmovw %eax, %k1
618; X86-NEXT:    vcvtps2pd %xmm0, %ymm0 {%k1} {z}
619; X86-NEXT:    retl
620;
621; X64-LABEL: test_mm256_maskz_cvtps_pd:
622; X64:       # %bb.0: # %entry
623; X64-NEXT:    kmovw %edi, %k1
624; X64-NEXT:    vcvtps2pd %xmm0, %ymm0 {%k1} {z}
625; X64-NEXT:    retq
626entry:
627  %conv.i.i = fpext <4 x float> %__A to <4 x double>
628  %0 = bitcast i8 %__U to <8 x i1>
629  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
630  %1 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> zeroinitializer
631  ret <4 x double> %1
632}
633
634define <2 x i64> @test_mm_cvtps_epu32(<4 x float> %__A) {
635; CHECK-LABEL: test_mm_cvtps_epu32:
636; CHECK:       # %bb.0: # %entry
637; CHECK-NEXT:    vcvtps2udq %xmm0, %xmm0
638; CHECK-NEXT:    ret{{[l|q]}}
639entry:
640  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 -1) #8
641  %1 = bitcast <4 x i32> %0 to <2 x i64>
642  ret <2 x i64> %1
643}
644
645define <2 x i64> @test_mm_mask_cvtps_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) {
646; X86-LABEL: test_mm_mask_cvtps_epu32:
647; X86:       # %bb.0: # %entry
648; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
649; X86-NEXT:    kmovw %eax, %k1
650; X86-NEXT:    vcvtps2udq %xmm1, %xmm0 {%k1}
651; X86-NEXT:    retl
652;
653; X64-LABEL: test_mm_mask_cvtps_epu32:
654; X64:       # %bb.0: # %entry
655; X64-NEXT:    kmovw %edi, %k1
656; X64-NEXT:    vcvtps2udq %xmm1, %xmm0 {%k1}
657; X64-NEXT:    retq
658entry:
659  %0 = bitcast <2 x i64> %__W to <4 x i32>
660  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %__A, <4 x i32> %0, i8 %__U) #8
661  %2 = bitcast <4 x i32> %1 to <2 x i64>
662  ret <2 x i64> %2
663}
664
665define <2 x i64> @test_mm_maskz_cvtps_epu32(i8 zeroext %__U, <4 x float> %__A) {
666; X86-LABEL: test_mm_maskz_cvtps_epu32:
667; X86:       # %bb.0: # %entry
668; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
669; X86-NEXT:    kmovw %eax, %k1
670; X86-NEXT:    vcvtps2udq %xmm0, %xmm0 {%k1} {z}
671; X86-NEXT:    retl
672;
673; X64-LABEL: test_mm_maskz_cvtps_epu32:
674; X64:       # %bb.0: # %entry
675; X64-NEXT:    kmovw %edi, %k1
676; X64-NEXT:    vcvtps2udq %xmm0, %xmm0 {%k1} {z}
677; X64-NEXT:    retq
678entry:
679  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
680  %1 = bitcast <4 x i32> %0 to <2 x i64>
681  ret <2 x i64> %1
682}
683
684define <4 x i64> @test_mm256_cvtps_epu32(<8 x float> %__A) {
685; CHECK-LABEL: test_mm256_cvtps_epu32:
686; CHECK:       # %bb.0: # %entry
687; CHECK-NEXT:    vcvtps2udq %ymm0, %ymm0
688; CHECK-NEXT:    ret{{[l|q]}}
689entry:
690  %0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 -1) #8
691  %1 = bitcast <8 x i32> %0 to <4 x i64>
692  ret <4 x i64> %1
693}
694
695define <4 x i64> @test_mm256_mask_cvtps_epu32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
696; X86-LABEL: test_mm256_mask_cvtps_epu32:
697; X86:       # %bb.0: # %entry
698; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
699; X86-NEXT:    kmovw %eax, %k1
700; X86-NEXT:    vcvtps2udq %ymm1, %ymm0 {%k1}
701; X86-NEXT:    retl
702;
703; X64-LABEL: test_mm256_mask_cvtps_epu32:
704; X64:       # %bb.0: # %entry
705; X64-NEXT:    kmovw %edi, %k1
706; X64-NEXT:    vcvtps2udq %ymm1, %ymm0 {%k1}
707; X64-NEXT:    retq
708entry:
709  %0 = bitcast <4 x i64> %__W to <8 x i32>
710  %1 = tail call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %__A, <8 x i32> %0, i8 %__U) #8
711  %2 = bitcast <8 x i32> %1 to <4 x i64>
712  ret <4 x i64> %2
713}
714
715define <4 x i64> @test_mm256_maskz_cvtps_epu32(i8 zeroext %__U, <8 x float> %__A) {
716; X86-LABEL: test_mm256_maskz_cvtps_epu32:
717; X86:       # %bb.0: # %entry
718; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
719; X86-NEXT:    kmovw %eax, %k1
720; X86-NEXT:    vcvtps2udq %ymm0, %ymm0 {%k1} {z}
721; X86-NEXT:    retl
722;
723; X64-LABEL: test_mm256_maskz_cvtps_epu32:
724; X64:       # %bb.0: # %entry
725; X64-NEXT:    kmovw %edi, %k1
726; X64-NEXT:    vcvtps2udq %ymm0, %ymm0 {%k1} {z}
727; X64-NEXT:    retq
728entry:
729  %0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 %__U) #8
730  %1 = bitcast <8 x i32> %0 to <4 x i64>
731  ret <4 x i64> %1
732}
733
734define <2 x i64> @test_mm_mask_cvttpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) {
735; X86-LABEL: test_mm_mask_cvttpd_epi32:
736; X86:       # %bb.0: # %entry
737; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
738; X86-NEXT:    kmovw %eax, %k1
739; X86-NEXT:    vcvttpd2dq %xmm1, %xmm0 {%k1}
740; X86-NEXT:    retl
741;
742; X64-LABEL: test_mm_mask_cvttpd_epi32:
743; X64:       # %bb.0: # %entry
744; X64-NEXT:    kmovw %edi, %k1
745; X64-NEXT:    vcvttpd2dq %xmm1, %xmm0 {%k1}
746; X64-NEXT:    retq
747entry:
748  %0 = bitcast <2 x i64> %__W to <4 x i32>
749  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8
750  %2 = bitcast <4 x i32> %1 to <2 x i64>
751  ret <2 x i64> %2
752}
753
754define <2 x i64> @test_mm_maskz_cvttpd_epi32(i8 zeroext %__U, <2 x double> %__A) {
755; X86-LABEL: test_mm_maskz_cvttpd_epi32:
756; X86:       # %bb.0: # %entry
757; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
758; X86-NEXT:    kmovw %eax, %k1
759; X86-NEXT:    vcvttpd2dq %xmm0, %xmm0 {%k1} {z}
760; X86-NEXT:    retl
761;
762; X64-LABEL: test_mm_maskz_cvttpd_epi32:
763; X64:       # %bb.0: # %entry
764; X64-NEXT:    kmovw %edi, %k1
765; X64-NEXT:    vcvttpd2dq %xmm0, %xmm0 {%k1} {z}
766; X64-NEXT:    retq
767entry:
768  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
769  %1 = bitcast <4 x i32> %0 to <2 x i64>
770  ret <2 x i64> %1
771}
772
773define <2 x i64> @test_mm256_mask_cvttpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) {
774; X86-LABEL: test_mm256_mask_cvttpd_epi32:
775; X86:       # %bb.0: # %entry
776; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
777; X86-NEXT:    kmovw %eax, %k1
778; X86-NEXT:    vcvttpd2dq %ymm1, %xmm0 {%k1}
779; X86-NEXT:    vzeroupper
780; X86-NEXT:    retl
781;
782; X64-LABEL: test_mm256_mask_cvttpd_epi32:
783; X64:       # %bb.0: # %entry
784; X64-NEXT:    kmovw %edi, %k1
785; X64-NEXT:    vcvttpd2dq %ymm1, %xmm0 {%k1}
786; X64-NEXT:    vzeroupper
787; X64-NEXT:    retq
788entry:
789  %0 = tail call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %__A) #8
790  %1 = bitcast <2 x i64> %__W to <4 x i32>
791  %2 = bitcast i8 %__U to <8 x i1>
792  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
793  %3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1
794  %4 = bitcast <4 x i32> %3 to <2 x i64>
795  ret <2 x i64> %4
796}
797
798define <2 x i64> @test_mm256_maskz_cvttpd_epi32(i8 zeroext %__U, <4 x double> %__A) {
799; X86-LABEL: test_mm256_maskz_cvttpd_epi32:
800; X86:       # %bb.0: # %entry
801; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
802; X86-NEXT:    kmovw %eax, %k1
803; X86-NEXT:    vcvttpd2dq %ymm0, %xmm0 {%k1} {z}
804; X86-NEXT:    vzeroupper
805; X86-NEXT:    retl
806;
807; X64-LABEL: test_mm256_maskz_cvttpd_epi32:
808; X64:       # %bb.0: # %entry
809; X64-NEXT:    kmovw %edi, %k1
810; X64-NEXT:    vcvttpd2dq %ymm0, %xmm0 {%k1} {z}
811; X64-NEXT:    vzeroupper
812; X64-NEXT:    retq
813entry:
814  %0 = tail call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %__A) #8
815  %1 = bitcast i8 %__U to <8 x i1>
816  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
817  %2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer
818  %3 = bitcast <4 x i32> %2 to <2 x i64>
819  ret <2 x i64> %3
820}
821
822define <2 x i64> @test_mm_cvttpd_epu32(<2 x double> %__A) {
823; CHECK-LABEL: test_mm_cvttpd_epu32:
824; CHECK:       # %bb.0: # %entry
825; CHECK-NEXT:    vcvttpd2udq %xmm0, %xmm0
826; CHECK-NEXT:    ret{{[l|q]}}
827entry:
828  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8
829  %1 = bitcast <4 x i32> %0 to <2 x i64>
830  ret <2 x i64> %1
831}
832
833define <2 x i64> @test_mm_mask_cvttpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) {
834; X86-LABEL: test_mm_mask_cvttpd_epu32:
835; X86:       # %bb.0: # %entry
836; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
837; X86-NEXT:    kmovw %eax, %k1
838; X86-NEXT:    vcvttpd2udq %xmm1, %xmm0 {%k1}
839; X86-NEXT:    retl
840;
841; X64-LABEL: test_mm_mask_cvttpd_epu32:
842; X64:       # %bb.0: # %entry
843; X64-NEXT:    kmovw %edi, %k1
844; X64-NEXT:    vcvttpd2udq %xmm1, %xmm0 {%k1}
845; X64-NEXT:    retq
846entry:
847  %0 = bitcast <2 x i64> %__W to <4 x i32>
848  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8
849  %2 = bitcast <4 x i32> %1 to <2 x i64>
850  ret <2 x i64> %2
851}
852
853define <2 x i64> @test_mm_maskz_cvttpd_epu32(i8 zeroext %__U, <2 x double> %__A) {
854; X86-LABEL: test_mm_maskz_cvttpd_epu32:
855; X86:       # %bb.0: # %entry
856; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
857; X86-NEXT:    kmovw %eax, %k1
858; X86-NEXT:    vcvttpd2udq %xmm0, %xmm0 {%k1} {z}
859; X86-NEXT:    retl
860;
861; X64-LABEL: test_mm_maskz_cvttpd_epu32:
862; X64:       # %bb.0: # %entry
863; X64-NEXT:    kmovw %edi, %k1
864; X64-NEXT:    vcvttpd2udq %xmm0, %xmm0 {%k1} {z}
865; X64-NEXT:    retq
866entry:
867  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
868  %1 = bitcast <4 x i32> %0 to <2 x i64>
869  ret <2 x i64> %1
870}
871
872define <2 x i64> @test_mm256_cvttpd_epu32(<4 x double> %__A) {
873; CHECK-LABEL: test_mm256_cvttpd_epu32:
874; CHECK:       # %bb.0: # %entry
875; CHECK-NEXT:    vcvttpd2udq %ymm0, %xmm0
876; CHECK-NEXT:    vzeroupper
877; CHECK-NEXT:    ret{{[l|q]}}
878entry:
879  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8
880  %1 = bitcast <4 x i32> %0 to <2 x i64>
881  ret <2 x i64> %1
882}
883
884define <2 x i64> @test_mm256_mask_cvttpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) {
885; X86-LABEL: test_mm256_mask_cvttpd_epu32:
886; X86:       # %bb.0: # %entry
887; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
888; X86-NEXT:    kmovw %eax, %k1
889; X86-NEXT:    vcvttpd2udq %ymm1, %xmm0 {%k1}
890; X86-NEXT:    vzeroupper
891; X86-NEXT:    retl
892;
893; X64-LABEL: test_mm256_mask_cvttpd_epu32:
894; X64:       # %bb.0: # %entry
895; X64-NEXT:    kmovw %edi, %k1
896; X64-NEXT:    vcvttpd2udq %ymm1, %xmm0 {%k1}
897; X64-NEXT:    vzeroupper
898; X64-NEXT:    retq
899entry:
900  %0 = bitcast <2 x i64> %__W to <4 x i32>
901  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %__A, <4 x i32> %0, i8 %__U) #8
902  %2 = bitcast <4 x i32> %1 to <2 x i64>
903  ret <2 x i64> %2
904}
905
906define <2 x i64> @test_mm256_maskz_cvttpd_epu32(i8 zeroext %__U, <4 x double> %__A) {
907; X86-LABEL: test_mm256_maskz_cvttpd_epu32:
908; X86:       # %bb.0: # %entry
909; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
910; X86-NEXT:    kmovw %eax, %k1
911; X86-NEXT:    vcvttpd2udq %ymm0, %xmm0 {%k1} {z}
912; X86-NEXT:    vzeroupper
913; X86-NEXT:    retl
914;
915; X64-LABEL: test_mm256_maskz_cvttpd_epu32:
916; X64:       # %bb.0: # %entry
917; X64-NEXT:    kmovw %edi, %k1
918; X64-NEXT:    vcvttpd2udq %ymm0, %xmm0 {%k1} {z}
919; X64-NEXT:    vzeroupper
920; X64-NEXT:    retq
921entry:
922  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
923  %1 = bitcast <4 x i32> %0 to <2 x i64>
924  ret <2 x i64> %1
925}
926
927define <2 x i64> @test_mm_mask_cvttps_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) {
928; X86-LABEL: test_mm_mask_cvttps_epi32:
929; X86:       # %bb.0: # %entry
930; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
931; X86-NEXT:    kmovw %eax, %k1
932; X86-NEXT:    vcvttps2dq %xmm1, %xmm0 {%k1}
933; X86-NEXT:    retl
934;
935; X64-LABEL: test_mm_mask_cvttps_epi32:
936; X64:       # %bb.0: # %entry
937; X64-NEXT:    kmovw %edi, %k1
938; X64-NEXT:    vcvttps2dq %xmm1, %xmm0 {%k1}
939; X64-NEXT:    retq
940entry:
941  %0 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %__A) #8
942  %1 = bitcast <2 x i64> %__W to <4 x i32>
943  %2 = bitcast i8 %__U to <8 x i1>
944  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
945  %3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1
946  %4 = bitcast <4 x i32> %3 to <2 x i64>
947  ret <2 x i64> %4
948}
949
950define <2 x i64> @test_mm_maskz_cvttps_epi32(i8 zeroext %__U, <4 x float> %__A) {
951; X86-LABEL: test_mm_maskz_cvttps_epi32:
952; X86:       # %bb.0: # %entry
953; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
954; X86-NEXT:    kmovw %eax, %k1
955; X86-NEXT:    vcvttps2dq %xmm0, %xmm0 {%k1} {z}
956; X86-NEXT:    retl
957;
958; X64-LABEL: test_mm_maskz_cvttps_epi32:
959; X64:       # %bb.0: # %entry
960; X64-NEXT:    kmovw %edi, %k1
961; X64-NEXT:    vcvttps2dq %xmm0, %xmm0 {%k1} {z}
962; X64-NEXT:    retq
963entry:
964  %0 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %__A) #8
965  %1 = bitcast i8 %__U to <8 x i1>
966  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
967  %2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer
968  %3 = bitcast <4 x i32> %2 to <2 x i64>
969  ret <2 x i64> %3
970}
971
972define <4 x i64> @test_mm256_mask_cvttps_epi32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
973; X86-LABEL: test_mm256_mask_cvttps_epi32:
974; X86:       # %bb.0: # %entry
975; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
976; X86-NEXT:    kmovw %eax, %k1
977; X86-NEXT:    vcvttps2dq %ymm1, %ymm0 {%k1}
978; X86-NEXT:    retl
979;
980; X64-LABEL: test_mm256_mask_cvttps_epi32:
981; X64:       # %bb.0: # %entry
982; X64-NEXT:    kmovw %edi, %k1
983; X64-NEXT:    vcvttps2dq %ymm1, %ymm0 {%k1}
984; X64-NEXT:    retq
985entry:
986  %0 = tail call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %__A) #8
987  %1 = bitcast <4 x i64> %__W to <8 x i32>
988  %2 = bitcast i8 %__U to <8 x i1>
989  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
990  %4 = bitcast <8 x i32> %3 to <4 x i64>
991  ret <4 x i64> %4
992}
993
994define <4 x i64> @test_mm256_maskz_cvttps_epi32(i8 zeroext %__U, <8 x float> %__A) {
995; X86-LABEL: test_mm256_maskz_cvttps_epi32:
996; X86:       # %bb.0: # %entry
997; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
998; X86-NEXT:    kmovw %eax, %k1
999; X86-NEXT:    vcvttps2dq %ymm0, %ymm0 {%k1} {z}
1000; X86-NEXT:    retl
1001;
1002; X64-LABEL: test_mm256_maskz_cvttps_epi32:
1003; X64:       # %bb.0: # %entry
1004; X64-NEXT:    kmovw %edi, %k1
1005; X64-NEXT:    vcvttps2dq %ymm0, %ymm0 {%k1} {z}
1006; X64-NEXT:    retq
1007entry:
1008  %0 = tail call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %__A) #8
1009  %1 = bitcast i8 %__U to <8 x i1>
1010  %2 = select <8 x i1> %1, <8 x i32> %0, <8 x i32> zeroinitializer
1011  %3 = bitcast <8 x i32> %2 to <4 x i64>
1012  ret <4 x i64> %3
1013}
1014
1015define <2 x i64> @test_mm_cvttps_epu32(<4 x float> %__A) {
1016; CHECK-LABEL: test_mm_cvttps_epu32:
1017; CHECK:       # %bb.0: # %entry
1018; CHECK-NEXT:    vcvttps2udq %xmm0, %xmm0
1019; CHECK-NEXT:    ret{{[l|q]}}
1020entry:
1021  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 -1) #8
1022  %1 = bitcast <4 x i32> %0 to <2 x i64>
1023  ret <2 x i64> %1
1024}
1025
1026define <2 x i64> @test_mm_mask_cvttps_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) {
1027; X86-LABEL: test_mm_mask_cvttps_epu32:
1028; X86:       # %bb.0: # %entry
1029; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1030; X86-NEXT:    kmovw %eax, %k1
1031; X86-NEXT:    vcvttps2udq %xmm1, %xmm0 {%k1}
1032; X86-NEXT:    retl
1033;
1034; X64-LABEL: test_mm_mask_cvttps_epu32:
1035; X64:       # %bb.0: # %entry
1036; X64-NEXT:    kmovw %edi, %k1
1037; X64-NEXT:    vcvttps2udq %xmm1, %xmm0 {%k1}
1038; X64-NEXT:    retq
1039entry:
1040  %0 = bitcast <2 x i64> %__W to <4 x i32>
1041  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %__A, <4 x i32> %0, i8 %__U) #8
1042  %2 = bitcast <4 x i32> %1 to <2 x i64>
1043  ret <2 x i64> %2
1044}
1045
1046define <2 x i64> @test_mm_maskz_cvttps_epu32(i8 zeroext %__U, <4 x float> %__A) {
1047; X86-LABEL: test_mm_maskz_cvttps_epu32:
1048; X86:       # %bb.0: # %entry
1049; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1050; X86-NEXT:    kmovw %eax, %k1
1051; X86-NEXT:    vcvttps2udq %xmm0, %xmm0 {%k1} {z}
1052; X86-NEXT:    retl
1053;
1054; X64-LABEL: test_mm_maskz_cvttps_epu32:
1055; X64:       # %bb.0: # %entry
1056; X64-NEXT:    kmovw %edi, %k1
1057; X64-NEXT:    vcvttps2udq %xmm0, %xmm0 {%k1} {z}
1058; X64-NEXT:    retq
1059entry:
1060  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
1061  %1 = bitcast <4 x i32> %0 to <2 x i64>
1062  ret <2 x i64> %1
1063}
1064
1065define <4 x i64> @test_mm256_cvttps_epu32(<8 x float> %__A) {
1066; CHECK-LABEL: test_mm256_cvttps_epu32:
1067; CHECK:       # %bb.0: # %entry
1068; CHECK-NEXT:    vcvttps2udq %ymm0, %ymm0
1069; CHECK-NEXT:    ret{{[l|q]}}
1070entry:
1071  %0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 -1) #8
1072  %1 = bitcast <8 x i32> %0 to <4 x i64>
1073  ret <4 x i64> %1
1074}
1075
1076define <4 x i64> @test_mm256_mask_cvttps_epu32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
1077; X86-LABEL: test_mm256_mask_cvttps_epu32:
1078; X86:       # %bb.0: # %entry
1079; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1080; X86-NEXT:    kmovw %eax, %k1
1081; X86-NEXT:    vcvttps2udq %ymm1, %ymm0 {%k1}
1082; X86-NEXT:    retl
1083;
1084; X64-LABEL: test_mm256_mask_cvttps_epu32:
1085; X64:       # %bb.0: # %entry
1086; X64-NEXT:    kmovw %edi, %k1
1087; X64-NEXT:    vcvttps2udq %ymm1, %ymm0 {%k1}
1088; X64-NEXT:    retq
1089entry:
1090  %0 = bitcast <4 x i64> %__W to <8 x i32>
1091  %1 = tail call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %__A, <8 x i32> %0, i8 %__U) #8
1092  %2 = bitcast <8 x i32> %1 to <4 x i64>
1093  ret <4 x i64> %2
1094}
1095
1096define <4 x i64> @test_mm256_maskz_cvttps_epu32(i8 zeroext %__U, <8 x float> %__A) {
1097; X86-LABEL: test_mm256_maskz_cvttps_epu32:
1098; X86:       # %bb.0: # %entry
1099; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1100; X86-NEXT:    kmovw %eax, %k1
1101; X86-NEXT:    vcvttps2udq %ymm0, %ymm0 {%k1} {z}
1102; X86-NEXT:    retl
1103;
1104; X64-LABEL: test_mm256_maskz_cvttps_epu32:
1105; X64:       # %bb.0: # %entry
1106; X64-NEXT:    kmovw %edi, %k1
1107; X64-NEXT:    vcvttps2udq %ymm0, %ymm0 {%k1} {z}
1108; X64-NEXT:    retq
1109entry:
1110  %0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 %__U) #8
1111  %1 = bitcast <8 x i32> %0 to <4 x i64>
1112  ret <4 x i64> %1
1113}
1114
1115define <2 x double> @test_mm_cvtepu32_pd(<2 x i64> %__A) local_unnamed_addr #0 {
1116; CHECK-LABEL: test_mm_cvtepu32_pd:
1117; CHECK:       # %bb.0: # %entry
1118; CHECK-NEXT:    vcvtudq2pd %xmm0, %xmm0
1119; CHECK-NEXT:    ret{{[l|q]}}
1120entry:
1121  %0 = bitcast <2 x i64> %__A to <4 x i32>
1122  %shuffle.i = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
1123  %conv.i = uitofp <2 x i32> %shuffle.i to <2 x double>
1124  ret <2 x double> %conv.i
1125}
1126
1127define <2 x double> @test_mm_mask_cvtepu32_pd(<2 x double> %__W, i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
1128; X86-LABEL: test_mm_mask_cvtepu32_pd:
1129; X86:       # %bb.0: # %entry
1130; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1131; X86-NEXT:    kmovw %eax, %k1
1132; X86-NEXT:    vcvtudq2pd %xmm1, %xmm0 {%k1}
1133; X86-NEXT:    retl
1134;
1135; X64-LABEL: test_mm_mask_cvtepu32_pd:
1136; X64:       # %bb.0: # %entry
1137; X64-NEXT:    kmovw %edi, %k1
1138; X64-NEXT:    vcvtudq2pd %xmm1, %xmm0 {%k1}
1139; X64-NEXT:    retq
1140entry:
1141  %0 = bitcast <2 x i64> %__A to <4 x i32>
1142  %shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
1143  %conv.i.i = uitofp <2 x i32> %shuffle.i.i to <2 x double>
1144  %1 = bitcast i8 %__U to <8 x i1>
1145  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
1146  %2 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> %__W
1147  ret <2 x double> %2
1148}
1149
1150define <2 x double> @test_mm_maskz_cvtepu32_pd(i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
1151; X86-LABEL: test_mm_maskz_cvtepu32_pd:
1152; X86:       # %bb.0: # %entry
1153; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1154; X86-NEXT:    kmovw %eax, %k1
1155; X86-NEXT:    vcvtudq2pd %xmm0, %xmm0 {%k1} {z}
1156; X86-NEXT:    retl
1157;
1158; X64-LABEL: test_mm_maskz_cvtepu32_pd:
1159; X64:       # %bb.0: # %entry
1160; X64-NEXT:    kmovw %edi, %k1
1161; X64-NEXT:    vcvtudq2pd %xmm0, %xmm0 {%k1} {z}
1162; X64-NEXT:    retq
1163entry:
1164  %0 = bitcast <2 x i64> %__A to <4 x i32>
1165  %shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
1166  %conv.i.i = uitofp <2 x i32> %shuffle.i.i to <2 x double>
1167  %1 = bitcast i8 %__U to <8 x i1>
1168  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
1169  %2 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> zeroinitializer
1170  ret <2 x double> %2
1171}
1172
1173define <4 x double> @test_mm256_cvtepu32_pd(<2 x i64> %__A) local_unnamed_addr #0 {
1174; CHECK-LABEL: test_mm256_cvtepu32_pd:
1175; CHECK:       # %bb.0: # %entry
1176; CHECK-NEXT:    vcvtudq2pd %xmm0, %ymm0
1177; CHECK-NEXT:    ret{{[l|q]}}
1178entry:
1179  %0 = bitcast <2 x i64> %__A to <4 x i32>
1180  %conv.i = uitofp <4 x i32> %0 to <4 x double>
1181  ret <4 x double> %conv.i
1182}
1183
1184define <4 x double> @test_mm256_mask_cvtepu32_pd(<4 x double> %__W, i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
1185; X86-LABEL: test_mm256_mask_cvtepu32_pd:
1186; X86:       # %bb.0: # %entry
1187; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1188; X86-NEXT:    kmovw %eax, %k1
1189; X86-NEXT:    vcvtudq2pd %xmm1, %ymm0 {%k1}
1190; X86-NEXT:    retl
1191;
1192; X64-LABEL: test_mm256_mask_cvtepu32_pd:
1193; X64:       # %bb.0: # %entry
1194; X64-NEXT:    kmovw %edi, %k1
1195; X64-NEXT:    vcvtudq2pd %xmm1, %ymm0 {%k1}
1196; X64-NEXT:    retq
1197entry:
1198  %0 = bitcast <2 x i64> %__A to <4 x i32>
1199  %conv.i.i = uitofp <4 x i32> %0 to <4 x double>
1200  %1 = bitcast i8 %__U to <8 x i1>
1201  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1202  %2 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> %__W
1203  ret <4 x double> %2
1204}
1205
1206define <4 x double> @test_mm256_maskz_cvtepu32_pd(i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
1207; X86-LABEL: test_mm256_maskz_cvtepu32_pd:
1208; X86:       # %bb.0: # %entry
1209; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1210; X86-NEXT:    kmovw %eax, %k1
1211; X86-NEXT:    vcvtudq2pd %xmm0, %ymm0 {%k1} {z}
1212; X86-NEXT:    retl
1213;
1214; X64-LABEL: test_mm256_maskz_cvtepu32_pd:
1215; X64:       # %bb.0: # %entry
1216; X64-NEXT:    kmovw %edi, %k1
1217; X64-NEXT:    vcvtudq2pd %xmm0, %ymm0 {%k1} {z}
1218; X64-NEXT:    retq
1219entry:
1220  %0 = bitcast <2 x i64> %__A to <4 x i32>
1221  %conv.i.i = uitofp <4 x i32> %0 to <4 x double>
1222  %1 = bitcast i8 %__U to <8 x i1>
1223  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1224  %2 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> zeroinitializer
1225  ret <4 x double> %2
1226}
1227
1228define <4 x float> @test_mm_cvtepu32_ps(<2 x i64> %__A) {
1229; CHECK-LABEL: test_mm_cvtepu32_ps:
1230; CHECK:       # %bb.0: # %entry
1231; CHECK-NEXT:    vcvtudq2ps %xmm0, %xmm0
1232; CHECK-NEXT:    ret{{[l|q]}}
1233entry:
1234  %0 = bitcast <2 x i64> %__A to <4 x i32>
1235  %conv.i = uitofp <4 x i32> %0 to <4 x float>
1236  ret <4 x float> %conv.i
1237}
1238
1239define <4 x float> @test_mm_mask_cvtepu32_ps(<4 x float> %__W, i8 zeroext %__U, <2 x i64> %__A) {
1240; X86-LABEL: test_mm_mask_cvtepu32_ps:
1241; X86:       # %bb.0: # %entry
1242; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1243; X86-NEXT:    kmovw %eax, %k1
1244; X86-NEXT:    vcvtudq2ps %xmm1, %xmm0 {%k1}
1245; X86-NEXT:    retl
1246;
1247; X64-LABEL: test_mm_mask_cvtepu32_ps:
1248; X64:       # %bb.0: # %entry
1249; X64-NEXT:    kmovw %edi, %k1
1250; X64-NEXT:    vcvtudq2ps %xmm1, %xmm0 {%k1}
1251; X64-NEXT:    retq
1252entry:
1253  %0 = bitcast <2 x i64> %__A to <4 x i32>
1254  %conv.i.i = uitofp <4 x i32> %0 to <4 x float>
1255  %1 = bitcast i8 %__U to <8 x i1>
1256  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1257  %2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> %__W
1258  ret <4 x float> %2
1259}
1260
1261define <4 x float> @test_mm_maskz_cvtepu32_ps(i8 zeroext %__U, <2 x i64> %__A) {
1262; X86-LABEL: test_mm_maskz_cvtepu32_ps:
1263; X86:       # %bb.0: # %entry
1264; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1265; X86-NEXT:    kmovw %eax, %k1
1266; X86-NEXT:    vcvtudq2ps %xmm0, %xmm0 {%k1} {z}
1267; X86-NEXT:    retl
1268;
1269; X64-LABEL: test_mm_maskz_cvtepu32_ps:
1270; X64:       # %bb.0: # %entry
1271; X64-NEXT:    kmovw %edi, %k1
1272; X64-NEXT:    vcvtudq2ps %xmm0, %xmm0 {%k1} {z}
1273; X64-NEXT:    retq
1274entry:
1275  %0 = bitcast <2 x i64> %__A to <4 x i32>
1276  %conv.i.i = uitofp <4 x i32> %0 to <4 x float>
1277  %1 = bitcast i8 %__U to <8 x i1>
1278  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1279  %2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> zeroinitializer
1280  ret <4 x float> %2
1281}
1282
1283define <8 x float> @test_mm256_cvtepu32_ps(<4 x i64> %__A) {
1284; CHECK-LABEL: test_mm256_cvtepu32_ps:
1285; CHECK:       # %bb.0: # %entry
1286; CHECK-NEXT:    vcvtudq2ps %ymm0, %ymm0
1287; CHECK-NEXT:    ret{{[l|q]}}
1288entry:
1289  %0 = bitcast <4 x i64> %__A to <8 x i32>
1290  %conv.i = uitofp <8 x i32> %0 to <8 x float>
1291  ret <8 x float> %conv.i
1292}
1293
1294define <8 x float> @test_mm256_mask_cvtepu32_ps(<8 x float> %__W, i8 zeroext %__U, <4 x i64> %__A) {
1295; X86-LABEL: test_mm256_mask_cvtepu32_ps:
1296; X86:       # %bb.0: # %entry
1297; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1298; X86-NEXT:    kmovw %eax, %k1
1299; X86-NEXT:    vcvtudq2ps %ymm1, %ymm0 {%k1}
1300; X86-NEXT:    retl
1301;
1302; X64-LABEL: test_mm256_mask_cvtepu32_ps:
1303; X64:       # %bb.0: # %entry
1304; X64-NEXT:    kmovw %edi, %k1
1305; X64-NEXT:    vcvtudq2ps %ymm1, %ymm0 {%k1}
1306; X64-NEXT:    retq
1307entry:
1308  %0 = bitcast <4 x i64> %__A to <8 x i32>
1309  %conv.i.i = uitofp <8 x i32> %0 to <8 x float>
1310  %1 = bitcast i8 %__U to <8 x i1>
1311  %2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> %__W
1312  ret <8 x float> %2
1313}
1314
1315define <8 x float> @test_mm256_maskz_cvtepu32_ps(i8 zeroext %__U, <4 x i64> %__A) {
1316; X86-LABEL: test_mm256_maskz_cvtepu32_ps:
1317; X86:       # %bb.0: # %entry
1318; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1319; X86-NEXT:    kmovw %eax, %k1
1320; X86-NEXT:    vcvtudq2ps %ymm0, %ymm0 {%k1} {z}
1321; X86-NEXT:    retl
1322;
1323; X64-LABEL: test_mm256_maskz_cvtepu32_ps:
1324; X64:       # %bb.0: # %entry
1325; X64-NEXT:    kmovw %edi, %k1
1326; X64-NEXT:    vcvtudq2ps %ymm0, %ymm0 {%k1} {z}
1327; X64-NEXT:    retq
1328entry:
1329  %0 = bitcast <4 x i64> %__A to <8 x i32>
1330  %conv.i.i = uitofp <8 x i32> %0 to <8 x float>
1331  %1 = bitcast i8 %__U to <8 x i1>
1332  %2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> zeroinitializer
1333  ret <8 x float> %2
1334}
1335
1336define <8 x float> @test_mm256_shuffle_f32x4(<8 x float> %__A, <8 x float> %__B) {
1337; CHECK-LABEL: test_mm256_shuffle_f32x4:
1338; CHECK:       # %bb.0: # %entry
1339; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
1340; CHECK-NEXT:    ret{{[l|q]}}
1341entry:
1342  %shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
1343  ret <8 x float> %shuffle
1344}
1345
1346define <8 x float> @test_mm256_mask_shuffle_f32x4(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) {
1347; X86-LABEL: test_mm256_mask_shuffle_f32x4:
1348; X86:       # %bb.0: # %entry
1349; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1350; X86-NEXT:    kmovw %eax, %k1
1351; X86-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
1352; X86-NEXT:    retl
1353;
1354; X64-LABEL: test_mm256_mask_shuffle_f32x4:
1355; X64:       # %bb.0: # %entry
1356; X64-NEXT:    kmovw %edi, %k1
1357; X64-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
1358; X64-NEXT:    retq
1359entry:
1360  %shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
1361  %0 = bitcast i8 %__U to <8 x i1>
1362  %1 = select <8 x i1> %0, <8 x float> %shuffle, <8 x float> %__W
1363  ret <8 x float> %1
1364}
1365
1366define <8 x float> @test_mm256_maskz_shuffle_f32x4(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) {
1367; X86-LABEL: test_mm256_maskz_shuffle_f32x4:
1368; X86:       # %bb.0: # %entry
1369; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1370; X86-NEXT:    kmovw %eax, %k1
1371; X86-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
1372; X86-NEXT:    retl
1373;
1374; X64-LABEL: test_mm256_maskz_shuffle_f32x4:
1375; X64:       # %bb.0: # %entry
1376; X64-NEXT:    kmovw %edi, %k1
1377; X64-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
1378; X64-NEXT:    retq
1379entry:
1380  %shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
1381  %0 = bitcast i8 %__U to <8 x i1>
1382  %1 = select <8 x i1> %0, <8 x float> %shuffle, <8 x float> zeroinitializer
1383  ret <8 x float> %1
1384}
1385
1386define <4 x double> @test_mm256_shuffle_f64x2(<4 x double> %__A, <4 x double> %__B) {
1387; CHECK-LABEL: test_mm256_shuffle_f64x2:
1388; CHECK:       # %bb.0: # %entry
1389; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
1390; CHECK-NEXT:    ret{{[l|q]}}
1391entry:
1392  %shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1393  ret <4 x double> %shuffle
1394}
1395
1396define <4 x double> @test_mm256_mask_shuffle_f64x2(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
1397; X86-LABEL: test_mm256_mask_shuffle_f64x2:
1398; X86:       # %bb.0: # %entry
1399; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1400; X86-NEXT:    kmovw %eax, %k1
1401; X86-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
1402; X86-NEXT:    retl
1403;
1404; X64-LABEL: test_mm256_mask_shuffle_f64x2:
1405; X64:       # %bb.0: # %entry
1406; X64-NEXT:    kmovw %edi, %k1
1407; X64-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
1408; X64-NEXT:    retq
1409entry:
1410  %shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1411  %0 = bitcast i8 %__U to <8 x i1>
1412  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1413  %1 = select <4 x i1> %extract, <4 x double> %shuffle, <4 x double> %__W
1414  ret <4 x double> %1
1415}
1416
1417define <4 x double> @test_mm256_maskz_shuffle_f64x2(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
1418; X86-LABEL: test_mm256_maskz_shuffle_f64x2:
1419; X86:       # %bb.0: # %entry
1420; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1421; X86-NEXT:    kmovw %eax, %k1
1422; X86-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
1423; X86-NEXT:    retl
1424;
1425; X64-LABEL: test_mm256_maskz_shuffle_f64x2:
1426; X64:       # %bb.0: # %entry
1427; X64-NEXT:    kmovw %edi, %k1
1428; X64-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
1429; X64-NEXT:    retq
1430entry:
1431  %shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1432  %0 = bitcast i8 %__U to <8 x i1>
1433  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1434  %1 = select <4 x i1> %extract, <4 x double> %shuffle, <4 x double> zeroinitializer
1435  ret <4 x double> %1
1436}
1437
1438define <4 x i64> @test_mm256_shuffle_i32x4(<4 x i64> %__A, <4 x i64> %__B) {
1439; CHECK-LABEL: test_mm256_shuffle_i32x4:
1440; CHECK:       # %bb.0: # %entry
1441; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
1442; CHECK-NEXT:    ret{{[l|q]}}
1443entry:
1444  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1445  ret <4 x i64> %shuffle
1446}
1447
1448define <4 x i64> @test_mm256_mask_shuffle_i32x4(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1449; X86-LABEL: test_mm256_mask_shuffle_i32x4:
1450; X86:       # %bb.0: # %entry
1451; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1452; X86-NEXT:    kmovw %eax, %k1
1453; X86-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
1454; X86-NEXT:    retl
1455;
1456; X64-LABEL: test_mm256_mask_shuffle_i32x4:
1457; X64:       # %bb.0: # %entry
1458; X64-NEXT:    kmovw %edi, %k1
1459; X64-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
1460; X64-NEXT:    retq
1461entry:
1462  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1463  %0 = bitcast <4 x i64> %shuffle to <8 x i32>
1464  %1 = bitcast <4 x i64> %__W to <8 x i32>
1465  %2 = bitcast i8 %__U to <8 x i1>
1466  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
1467  %4 = bitcast <8 x i32> %3 to <4 x i64>
1468  ret <4 x i64> %4
1469}
1470
1471define <4 x i64> @test_mm256_maskz_shuffle_i32x4(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1472; X86-LABEL: test_mm256_maskz_shuffle_i32x4:
1473; X86:       # %bb.0: # %entry
1474; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1475; X86-NEXT:    kmovw %eax, %k1
1476; X86-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
1477; X86-NEXT:    retl
1478;
1479; X64-LABEL: test_mm256_maskz_shuffle_i32x4:
1480; X64:       # %bb.0: # %entry
1481; X64-NEXT:    kmovw %edi, %k1
1482; X64-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
1483; X64-NEXT:    retq
1484entry:
1485  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1486  %0 = bitcast <4 x i64> %shuffle to <8 x i32>
1487  %1 = bitcast i8 %__U to <8 x i1>
1488  %2 = select <8 x i1> %1, <8 x i32> %0, <8 x i32> zeroinitializer
1489  %3 = bitcast <8 x i32> %2 to <4 x i64>
1490  ret <4 x i64> %3
1491}
1492
1493define <4 x i64> @test_mm256_shuffle_i64x2(<4 x i64> %__A, <4 x i64> %__B) {
1494; CHECK-LABEL: test_mm256_shuffle_i64x2:
1495; CHECK:       # %bb.0: # %entry
1496; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
1497; CHECK-NEXT:    ret{{[l|q]}}
1498entry:
1499  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1500  ret <4 x i64> %shuffle
1501}
1502
1503define <4 x i64> @test_mm256_mask_shuffle_i64x2(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1504; X86-LABEL: test_mm256_mask_shuffle_i64x2:
1505; X86:       # %bb.0: # %entry
1506; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1507; X86-NEXT:    kmovw %eax, %k1
1508; X86-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
1509; X86-NEXT:    retl
1510;
1511; X64-LABEL: test_mm256_mask_shuffle_i64x2:
1512; X64:       # %bb.0: # %entry
1513; X64-NEXT:    kmovw %edi, %k1
1514; X64-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
1515; X64-NEXT:    retq
1516entry:
1517  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1518  %0 = bitcast i8 %__U to <8 x i1>
1519  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1520  %1 = select <4 x i1> %extract, <4 x i64> %shuffle, <4 x i64> %__W
1521  ret <4 x i64> %1
1522}
1523
1524define <4 x i64> @test_mm256_maskz_shuffle_i64x2(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1525; X86-LABEL: test_mm256_maskz_shuffle_i64x2:
1526; X86:       # %bb.0: # %entry
1527; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1528; X86-NEXT:    kmovw %eax, %k1
1529; X86-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
1530; X86-NEXT:    retl
1531;
1532; X64-LABEL: test_mm256_maskz_shuffle_i64x2:
1533; X64:       # %bb.0: # %entry
1534; X64-NEXT:    kmovw %edi, %k1
1535; X64-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
1536; X64-NEXT:    retq
1537entry:
1538  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1539  %0 = bitcast i8 %__U to <8 x i1>
1540  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1541  %1 = select <4 x i1> %extract, <4 x i64> %shuffle, <4 x i64> zeroinitializer
1542  ret <4 x i64> %1
1543}
1544
1545define zeroext i8 @test_mm_test_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) {
1546; CHECK-LABEL: test_mm_test_epi32_mask:
1547; CHECK:       # %bb.0: # %entry
1548; CHECK-NEXT:    vptestmd %xmm0, %xmm1, %k0
1549; CHECK-NEXT:    kmovw %k0, %eax
1550; CHECK-NEXT:    movzbl %al, %eax
1551; CHECK-NEXT:    ret{{[l|q]}}
1552entry:
1553  %and.i.i = and <2 x i64> %__B, %__A
1554  %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
1555  %1 = icmp ne <4 x i32> %0, zeroinitializer
1556  %2 = shufflevector <4 x i1> %1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1557  %3 = bitcast <8 x i1> %2 to i8
1558  ret i8 %3
1559}
1560
1561define zeroext i8 @test_mm_mask_test_epi32_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
1562; X86-LABEL: test_mm_mask_test_epi32_mask:
1563; X86:       # %bb.0: # %entry
1564; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1565; X86-NEXT:    kmovw %eax, %k1
1566; X86-NEXT:    vptestmd %xmm0, %xmm1, %k0 {%k1}
1567; X86-NEXT:    kmovw %k0, %eax
1568; X86-NEXT:    movzbl %al, %eax
1569; X86-NEXT:    retl
1570;
1571; X64-LABEL: test_mm_mask_test_epi32_mask:
1572; X64:       # %bb.0: # %entry
1573; X64-NEXT:    kmovw %edi, %k1
1574; X64-NEXT:    vptestmd %xmm0, %xmm1, %k0 {%k1}
1575; X64-NEXT:    kmovw %k0, %eax
1576; X64-NEXT:    movzbl %al, %eax
1577; X64-NEXT:    retq
1578entry:
1579  %and.i.i = and <2 x i64> %__B, %__A
1580  %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
1581  %1 = icmp ne <4 x i32> %0, zeroinitializer
1582  %2 = bitcast i8 %__U to <8 x i1>
1583  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1584  %3 = and <4 x i1> %1, %extract.i
1585  %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1586  %5 = bitcast <8 x i1> %4 to i8
1587  ret i8 %5
1588}
1589
1590define zeroext i8 @test_mm256_test_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) {
1591; CHECK-LABEL: test_mm256_test_epi32_mask:
1592; CHECK:       # %bb.0: # %entry
1593; CHECK-NEXT:    vptestmd %ymm0, %ymm1, %k0
1594; CHECK-NEXT:    kmovw %k0, %eax
1595; CHECK-NEXT:    movzbl %al, %eax
1596; CHECK-NEXT:    vzeroupper
1597; CHECK-NEXT:    ret{{[l|q]}}
1598entry:
1599  %and.i.i = and <4 x i64> %__B, %__A
1600  %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
1601  %1 = icmp ne <8 x i32> %0, zeroinitializer
1602  %2 = bitcast <8 x i1> %1 to i8
1603  ret i8 %2
1604}
1605
1606define zeroext i8 @test_mm256_mask_test_epi32_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1607; X86-LABEL: test_mm256_mask_test_epi32_mask:
1608; X86:       # %bb.0: # %entry
1609; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1610; X86-NEXT:    kmovw %eax, %k1
1611; X86-NEXT:    vptestmd %ymm0, %ymm1, %k0 {%k1}
1612; X86-NEXT:    kmovw %k0, %eax
1613; X86-NEXT:    movzbl %al, %eax
1614; X86-NEXT:    vzeroupper
1615; X86-NEXT:    retl
1616;
1617; X64-LABEL: test_mm256_mask_test_epi32_mask:
1618; X64:       # %bb.0: # %entry
1619; X64-NEXT:    kmovw %edi, %k1
1620; X64-NEXT:    vptestmd %ymm0, %ymm1, %k0 {%k1}
1621; X64-NEXT:    kmovw %k0, %eax
1622; X64-NEXT:    movzbl %al, %eax
1623; X64-NEXT:    vzeroupper
1624; X64-NEXT:    retq
1625entry:
1626  %and.i.i = and <4 x i64> %__B, %__A
1627  %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
1628  %1 = icmp ne <8 x i32> %0, zeroinitializer
1629  %2 = bitcast i8 %__U to <8 x i1>
1630  %3 = and <8 x i1> %1, %2
1631  %4 = bitcast <8 x i1> %3 to i8
1632  ret i8 %4
1633}
1634
1635define zeroext i8 @test_mm_test_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) {
1636; CHECK-LABEL: test_mm_test_epi64_mask:
1637; CHECK:       # %bb.0: # %entry
1638; CHECK-NEXT:    vptestmq %xmm0, %xmm1, %k0
1639; CHECK-NEXT:    kmovw %k0, %eax
1640; CHECK-NEXT:    movzbl %al, %eax
1641; CHECK-NEXT:    ret{{[l|q]}}
1642entry:
1643  %and.i.i = and <2 x i64> %__B, %__A
1644  %0 = icmp ne <2 x i64> %and.i.i, zeroinitializer
1645  %1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
1646  %2 = bitcast <8 x i1> %1 to i8
1647  ret i8 %2
1648}
1649
1650define zeroext i8 @test_mm_mask_test_epi64_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
1651; X86-LABEL: test_mm_mask_test_epi64_mask:
1652; X86:       # %bb.0: # %entry
1653; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1654; X86-NEXT:    kmovw %eax, %k1
1655; X86-NEXT:    vptestmq %xmm0, %xmm1, %k0 {%k1}
1656; X86-NEXT:    kmovw %k0, %eax
1657; X86-NEXT:    movzbl %al, %eax
1658; X86-NEXT:    retl
1659;
1660; X64-LABEL: test_mm_mask_test_epi64_mask:
1661; X64:       # %bb.0: # %entry
1662; X64-NEXT:    kmovw %edi, %k1
1663; X64-NEXT:    vptestmq %xmm0, %xmm1, %k0 {%k1}
1664; X64-NEXT:    kmovw %k0, %eax
1665; X64-NEXT:    movzbl %al, %eax
1666; X64-NEXT:    retq
1667entry:
1668  %and.i.i = and <2 x i64> %__B, %__A
1669  %0 = icmp ne <2 x i64> %and.i.i, zeroinitializer
1670  %1 = bitcast i8 %__U to <8 x i1>
1671  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
1672  %2 = and <2 x i1> %0, %extract.i
1673  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
1674  %4 = bitcast <8 x i1> %3 to i8
1675  ret i8 %4
1676}
1677
1678define zeroext i8 @test_mm256_test_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) {
1679; CHECK-LABEL: test_mm256_test_epi64_mask:
1680; CHECK:       # %bb.0: # %entry
1681; CHECK-NEXT:    vptestmq %ymm0, %ymm1, %k0
1682; CHECK-NEXT:    kmovw %k0, %eax
1683; CHECK-NEXT:    movzbl %al, %eax
1684; CHECK-NEXT:    vzeroupper
1685; CHECK-NEXT:    ret{{[l|q]}}
1686entry:
1687  %and.i.i = and <4 x i64> %__B, %__A
1688  %0 = icmp ne <4 x i64> %and.i.i, zeroinitializer
1689  %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1690  %2 = bitcast <8 x i1> %1 to i8
1691  ret i8 %2
1692}
1693
1694define zeroext i8 @test_mm256_mask_test_epi64_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1695; X86-LABEL: test_mm256_mask_test_epi64_mask:
1696; X86:       # %bb.0: # %entry
1697; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1698; X86-NEXT:    kmovw %eax, %k1
1699; X86-NEXT:    vptestmq %ymm0, %ymm1, %k0 {%k1}
1700; X86-NEXT:    kmovw %k0, %eax
1701; X86-NEXT:    movzbl %al, %eax
1702; X86-NEXT:    vzeroupper
1703; X86-NEXT:    retl
1704;
1705; X64-LABEL: test_mm256_mask_test_epi64_mask:
1706; X64:       # %bb.0: # %entry
1707; X64-NEXT:    kmovw %edi, %k1
1708; X64-NEXT:    vptestmq %ymm0, %ymm1, %k0 {%k1}
1709; X64-NEXT:    kmovw %k0, %eax
1710; X64-NEXT:    movzbl %al, %eax
1711; X64-NEXT:    vzeroupper
1712; X64-NEXT:    retq
1713entry:
1714  %and.i.i = and <4 x i64> %__B, %__A
1715  %0 = icmp ne <4 x i64> %and.i.i, zeroinitializer
1716  %1 = bitcast i8 %__U to <8 x i1>
1717  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1718  %2 = and <4 x i1> %0, %extract.i
1719  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1720  %4 = bitcast <8 x i1> %3 to i8
1721  ret i8 %4
1722}
1723
1724define zeroext i8 @test_mm_testn_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) {
1725; CHECK-LABEL: test_mm_testn_epi32_mask:
1726; CHECK:       # %bb.0: # %entry
1727; CHECK-NEXT:    vptestnmd %xmm0, %xmm1, %k0
1728; CHECK-NEXT:    kmovw %k0, %eax
1729; CHECK-NEXT:    movzbl %al, %eax
1730; CHECK-NEXT:    ret{{[l|q]}}
1731entry:
1732  %and.i.i = and <2 x i64> %__B, %__A
1733  %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
1734  %1 = icmp eq <4 x i32> %0, zeroinitializer
1735  %2 = shufflevector <4 x i1> %1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1736  %3 = bitcast <8 x i1> %2 to i8
1737  ret i8 %3
1738}
1739
1740define zeroext i8 @test_mm_mask_testn_epi32_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
1741; X86-LABEL: test_mm_mask_testn_epi32_mask:
1742; X86:       # %bb.0: # %entry
1743; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1744; X86-NEXT:    kmovw %eax, %k1
1745; X86-NEXT:    vptestnmd %xmm0, %xmm1, %k0 {%k1}
1746; X86-NEXT:    kmovw %k0, %eax
1747; X86-NEXT:    movzbl %al, %eax
1748; X86-NEXT:    retl
1749;
1750; X64-LABEL: test_mm_mask_testn_epi32_mask:
1751; X64:       # %bb.0: # %entry
1752; X64-NEXT:    kmovw %edi, %k1
1753; X64-NEXT:    vptestnmd %xmm0, %xmm1, %k0 {%k1}
1754; X64-NEXT:    kmovw %k0, %eax
1755; X64-NEXT:    movzbl %al, %eax
1756; X64-NEXT:    retq
1757entry:
1758  %and.i.i = and <2 x i64> %__B, %__A
1759  %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
1760  %1 = icmp eq <4 x i32> %0, zeroinitializer
1761  %2 = bitcast i8 %__U to <8 x i1>
1762  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1763  %3 = and <4 x i1> %1, %extract.i
1764  %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1765  %5 = bitcast <8 x i1> %4 to i8
1766  ret i8 %5
1767}
1768
1769define zeroext i8 @test_mm256_testn_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) {
1770; CHECK-LABEL: test_mm256_testn_epi32_mask:
1771; CHECK:       # %bb.0: # %entry
1772; CHECK-NEXT:    vptestnmd %ymm0, %ymm1, %k0
1773; CHECK-NEXT:    kmovw %k0, %eax
1774; CHECK-NEXT:    movzbl %al, %eax
1775; CHECK-NEXT:    vzeroupper
1776; CHECK-NEXT:    ret{{[l|q]}}
1777entry:
1778  %and.i.i = and <4 x i64> %__B, %__A
1779  %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
1780  %1 = icmp eq <8 x i32> %0, zeroinitializer
1781  %2 = bitcast <8 x i1> %1 to i8
1782  ret i8 %2
1783}
1784
1785define zeroext i8 @test_mm256_mask_testn_epi32_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1786; X86-LABEL: test_mm256_mask_testn_epi32_mask:
1787; X86:       # %bb.0: # %entry
1788; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1789; X86-NEXT:    kmovw %eax, %k1
1790; X86-NEXT:    vptestnmd %ymm0, %ymm1, %k0 {%k1}
1791; X86-NEXT:    kmovw %k0, %eax
1792; X86-NEXT:    movzbl %al, %eax
1793; X86-NEXT:    vzeroupper
1794; X86-NEXT:    retl
1795;
1796; X64-LABEL: test_mm256_mask_testn_epi32_mask:
1797; X64:       # %bb.0: # %entry
1798; X64-NEXT:    kmovw %edi, %k1
1799; X64-NEXT:    vptestnmd %ymm0, %ymm1, %k0 {%k1}
1800; X64-NEXT:    kmovw %k0, %eax
1801; X64-NEXT:    movzbl %al, %eax
1802; X64-NEXT:    vzeroupper
1803; X64-NEXT:    retq
1804entry:
1805  %and.i.i = and <4 x i64> %__B, %__A
1806  %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
1807  %1 = icmp eq <8 x i32> %0, zeroinitializer
1808  %2 = bitcast i8 %__U to <8 x i1>
1809  %3 = and <8 x i1> %1, %2
1810  %4 = bitcast <8 x i1> %3 to i8
1811  ret i8 %4
1812}
1813
1814define zeroext i8 @test_mm_testn_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) {
1815; CHECK-LABEL: test_mm_testn_epi64_mask:
1816; CHECK:       # %bb.0: # %entry
1817; CHECK-NEXT:    vptestnmq %xmm0, %xmm1, %k0
1818; CHECK-NEXT:    kmovw %k0, %eax
1819; CHECK-NEXT:    movzbl %al, %eax
1820; CHECK-NEXT:    ret{{[l|q]}}
1821entry:
1822  %and.i.i = and <2 x i64> %__B, %__A
1823  %0 = icmp eq <2 x i64> %and.i.i, zeroinitializer
1824  %1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
1825  %2 = bitcast <8 x i1> %1 to i8
1826  ret i8 %2
1827}
1828
1829define zeroext i8 @test_mm_mask_testn_epi64_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
1830; X86-LABEL: test_mm_mask_testn_epi64_mask:
1831; X86:       # %bb.0: # %entry
1832; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1833; X86-NEXT:    kmovw %eax, %k1
1834; X86-NEXT:    vptestnmq %xmm0, %xmm1, %k0 {%k1}
1835; X86-NEXT:    kmovw %k0, %eax
1836; X86-NEXT:    movzbl %al, %eax
1837; X86-NEXT:    retl
1838;
1839; X64-LABEL: test_mm_mask_testn_epi64_mask:
1840; X64:       # %bb.0: # %entry
1841; X64-NEXT:    kmovw %edi, %k1
1842; X64-NEXT:    vptestnmq %xmm0, %xmm1, %k0 {%k1}
1843; X64-NEXT:    kmovw %k0, %eax
1844; X64-NEXT:    movzbl %al, %eax
1845; X64-NEXT:    retq
1846entry:
1847  %and.i.i = and <2 x i64> %__B, %__A
1848  %0 = icmp eq <2 x i64> %and.i.i, zeroinitializer
1849  %1 = bitcast i8 %__U to <8 x i1>
1850  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
1851  %2 = and <2 x i1> %0, %extract.i
1852  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
1853  %4 = bitcast <8 x i1> %3 to i8
1854  ret i8 %4
1855}
1856
1857define zeroext i8 @test_mm256_testn_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) {
1858; CHECK-LABEL: test_mm256_testn_epi64_mask:
1859; CHECK:       # %bb.0: # %entry
1860; CHECK-NEXT:    vptestnmq %ymm0, %ymm1, %k0
1861; CHECK-NEXT:    kmovw %k0, %eax
1862; CHECK-NEXT:    movzbl %al, %eax
1863; CHECK-NEXT:    vzeroupper
1864; CHECK-NEXT:    ret{{[l|q]}}
1865entry:
1866  %and.i.i = and <4 x i64> %__B, %__A
1867  %0 = icmp eq <4 x i64> %and.i.i, zeroinitializer
1868  %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1869  %2 = bitcast <8 x i1> %1 to i8
1870  ret i8 %2
1871}
1872
1873define zeroext i8 @test_mm256_mask_testn_epi64_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1874; X86-LABEL: test_mm256_mask_testn_epi64_mask:
1875; X86:       # %bb.0: # %entry
1876; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1877; X86-NEXT:    kmovw %eax, %k1
1878; X86-NEXT:    vptestnmq %ymm0, %ymm1, %k0 {%k1}
1879; X86-NEXT:    kmovw %k0, %eax
1880; X86-NEXT:    movzbl %al, %eax
1881; X86-NEXT:    vzeroupper
1882; X86-NEXT:    retl
1883;
1884; X64-LABEL: test_mm256_mask_testn_epi64_mask:
1885; X64:       # %bb.0: # %entry
1886; X64-NEXT:    kmovw %edi, %k1
1887; X64-NEXT:    vptestnmq %ymm0, %ymm1, %k0 {%k1}
1888; X64-NEXT:    kmovw %k0, %eax
1889; X64-NEXT:    movzbl %al, %eax
1890; X64-NEXT:    vzeroupper
1891; X64-NEXT:    retq
1892entry:
1893  %and.i.i = and <4 x i64> %__B, %__A
1894  %0 = icmp eq <4 x i64> %and.i.i, zeroinitializer
1895  %1 = bitcast i8 %__U to <8 x i1>
1896  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1897  %2 = and <4 x i1> %0, %extract.i
1898  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1899  %4 = bitcast <8 x i1> %3 to i8
1900  ret i8 %4
1901}
1902
1903define <2 x i64> @test_mm_mask_set1_epi32(<2 x i64> %__O, i8 zeroext %__M)  {
1904; X86-LABEL: test_mm_mask_set1_epi32:
1905; X86:       # %bb.0: # %entry
1906; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1907; X86-NEXT:    kmovw %eax, %k1
1908; X86-NEXT:    vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 {%k1}
1909; X86-NEXT:    retl
1910;
1911; X64-LABEL: test_mm_mask_set1_epi32:
1912; X64:       # %bb.0: # %entry
1913; X64-NEXT:    kmovw %edi, %k1
1914; X64-NEXT:    vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1}
1915; X64-NEXT:    retq
1916entry:
1917  %0 = bitcast <2 x i64> %__O to <4 x i32>
1918  %1 = bitcast i8 %__M to <8 x i1>
1919  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1920  %2 = select <4 x i1> %extract.i, <4 x i32> <i32 5, i32 5, i32 5, i32 5>, <4 x i32> %0
1921  %3 = bitcast <4 x i32> %2 to <2 x i64>
1922  ret <2 x i64> %3
1923}
1924
1925define <2 x i64> @test_mm_maskz_set1_epi32(i8 zeroext %__M) {
1926; X86-LABEL: test_mm_maskz_set1_epi32:
1927; X86:       # %bb.0: # %entry
1928; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1929; X86-NEXT:    kmovw %eax, %k1
1930; X86-NEXT:    vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 {%k1} {z}
1931; X86-NEXT:    retl
1932;
1933; X64-LABEL: test_mm_maskz_set1_epi32:
1934; X64:       # %bb.0: # %entry
1935; X64-NEXT:    kmovw %edi, %k1
1936; X64-NEXT:    vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1} {z}
1937; X64-NEXT:    retq
1938entry:
1939  %0 = bitcast i8 %__M to <8 x i1>
1940  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1941  %1 = select <4 x i1> %extract.i, <4 x i32> <i32 5, i32 5, i32 5, i32 5>, <4 x i32> zeroinitializer
1942  %2 = bitcast <4 x i32> %1 to <2 x i64>
1943  ret <2 x i64> %2
1944}
1945
1946define <4 x i64> @test_mm256_mask_set1_epi32(<4 x i64> %__O, i8 zeroext %__M)  {
1947; X86-LABEL: test_mm256_mask_set1_epi32:
1948; X86:       # %bb.0: # %entry
1949; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1950; X86-NEXT:    kmovw %eax, %k1
1951; X86-NEXT:    vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0 {%k1}
1952; X86-NEXT:    retl
1953;
1954; X64-LABEL: test_mm256_mask_set1_epi32:
1955; X64:       # %bb.0: # %entry
1956; X64-NEXT:    kmovw %edi, %k1
1957; X64-NEXT:    vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 {%k1}
1958; X64-NEXT:    retq
1959entry:
1960  %0 = bitcast <4 x i64> %__O to <8 x i32>
1961  %1 = bitcast i8 %__M to <8 x i1>
1962  %2 = select <8 x i1> %1, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <8 x i32> %0
1963  %3 = bitcast <8 x i32> %2 to <4 x i64>
1964  ret <4 x i64> %3
1965}
1966
1967define <4 x i64> @test_mm256_maskz_set1_epi32(i8 zeroext %__M)  {
1968; X86-LABEL: test_mm256_maskz_set1_epi32:
1969; X86:       # %bb.0: # %entry
1970; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1971; X86-NEXT:    kmovw %eax, %k1
1972; X86-NEXT:    vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0 {%k1} {z}
1973; X86-NEXT:    retl
1974;
1975; X64-LABEL: test_mm256_maskz_set1_epi32:
1976; X64:       # %bb.0: # %entry
1977; X64-NEXT:    kmovw %edi, %k1
1978; X64-NEXT:    vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 {%k1} {z}
1979; X64-NEXT:    retq
1980entry:
1981  %0 = bitcast i8 %__M to <8 x i1>
1982  %1 = select <8 x i1> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <8 x i32> zeroinitializer
1983  %2 = bitcast <8 x i32> %1 to <4 x i64>
1984  ret <4 x i64> %2
1985}
1986
1987define <2 x i64> @test_mm_mask_set1_epi64(<2 x i64> %__O, i8 zeroext %__M, i64 %__A)  {
1988; X86-LABEL: test_mm_mask_set1_epi64:
1989; X86:       # %bb.0: # %entry
1990; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1991; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1992; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
1993; X86-NEXT:    kmovw %eax, %k1
1994; X86-NEXT:    vpbroadcastq %xmm1, %xmm0 {%k1}
1995; X86-NEXT:    retl
1996;
1997; X64-LABEL: test_mm_mask_set1_epi64:
1998; X64:       # %bb.0: # %entry
1999; X64-NEXT:    kmovw %edi, %k1
2000; X64-NEXT:    vpbroadcastq %rsi, %xmm0 {%k1}
2001; X64-NEXT:    retq
2002entry:
2003  %vecinit.i.i.i = insertelement <2 x i64> undef, i64 %__A, i32 0
2004  %vecinit1.i.i.i = shufflevector <2 x i64> %vecinit.i.i.i, <2 x i64> undef, <2 x i32> zeroinitializer
2005  %0 = bitcast i8 %__M to <8 x i1>
2006  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
2007  %1 = select <2 x i1> %extract.i, <2 x i64> %vecinit1.i.i.i, <2 x i64> %__O
2008  ret <2 x i64> %1
2009}
2010
2011define <2 x i64> @test_mm_maskz_set1_epi64(i8 zeroext %__M, i64 %__A)  {
2012; X86-LABEL: test_mm_maskz_set1_epi64:
2013; X86:       # %bb.0: # %entry
2014; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2015; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2016; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
2017; X86-NEXT:    kmovw %eax, %k1
2018; X86-NEXT:    vpbroadcastq %xmm0, %xmm0 {%k1} {z}
2019; X86-NEXT:    retl
2020;
2021; X64-LABEL: test_mm_maskz_set1_epi64:
2022; X64:       # %bb.0: # %entry
2023; X64-NEXT:    kmovw %edi, %k1
2024; X64-NEXT:    vpbroadcastq %rsi, %xmm0 {%k1} {z}
2025; X64-NEXT:    retq
2026entry:
2027  %vecinit.i.i.i = insertelement <2 x i64> undef, i64 %__A, i32 0
2028  %vecinit1.i.i.i = shufflevector <2 x i64> %vecinit.i.i.i, <2 x i64> undef, <2 x i32> zeroinitializer
2029  %0 = bitcast i8 %__M to <8 x i1>
2030  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
2031  %1 = select <2 x i1> %extract.i, <2 x i64> %vecinit1.i.i.i, <2 x i64> zeroinitializer
2032  ret <2 x i64> %1
2033}
2034
2035
2036define <4 x i64> @test_mm256_mask_set1_epi64(<4 x i64> %__O, i8 zeroext %__M, i64 %__A) {
2037; X86-LABEL: test_mm256_mask_set1_epi64:
2038; X86:       # %bb.0: # %entry
2039; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2040; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2041; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
2042; X86-NEXT:    kmovw %eax, %k1
2043; X86-NEXT:    vpbroadcastq %xmm1, %ymm0 {%k1}
2044; X86-NEXT:    retl
2045;
2046; X64-LABEL: test_mm256_mask_set1_epi64:
2047; X64:       # %bb.0: # %entry
2048; X64-NEXT:    kmovw %edi, %k1
2049; X64-NEXT:    vpbroadcastq %rsi, %ymm0 {%k1}
2050; X64-NEXT:    retq
2051entry:
2052  %vecinit.i.i = insertelement <4 x i64> undef, i64 %__A, i32 0
2053  %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer
2054  %0 = bitcast i8 %__M to <8 x i1>
2055  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2056  %1 = select <4 x i1> %extract.i, <4 x i64> %vecinit3.i.i, <4 x i64> %__O
2057  ret <4 x i64> %1
2058}
2059
2060define <4 x i64> @test_mm256_maskz_set1_epi64(i8 zeroext %__M, i64 %__A)  {
2061; X86-LABEL: test_mm256_maskz_set1_epi64:
2062; X86:       # %bb.0: # %entry
2063; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2064; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2065; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
2066; X86-NEXT:    kmovw %eax, %k1
2067; X86-NEXT:    vpbroadcastq %xmm0, %ymm0 {%k1} {z}
2068; X86-NEXT:    retl
2069;
2070; X64-LABEL: test_mm256_maskz_set1_epi64:
2071; X64:       # %bb.0: # %entry
2072; X64-NEXT:    kmovw %edi, %k1
2073; X64-NEXT:    vpbroadcastq %rsi, %ymm0 {%k1} {z}
2074; X64-NEXT:    retq
2075entry:
2076  %vecinit.i.i = insertelement <4 x i64> undef, i64 %__A, i32 0
2077  %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer
2078  %0 = bitcast i8 %__M to <8 x i1>
2079  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2080  %1 = select <4 x i1> %extract.i, <4 x i64> %vecinit3.i.i, <4 x i64> zeroinitializer
2081  ret <4 x i64> %1
2082}
2083
2084define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
2085; CHECK-LABEL: test_mm_broadcastd_epi32:
2086; CHECK:       # %bb.0:
2087; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
2088; CHECK-NEXT:    ret{{[l|q]}}
2089  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
2090  %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
2091  %res1 = bitcast <4 x i32> %res0 to <2 x i64>
2092  ret <2 x i64> %res1
2093}
2094
2095define <2 x i64> @test_mm_mask_broadcastd_epi32(<2 x i64> %__O, i8 zeroext %__M, <2 x i64> %__A) {
2096; X86-LABEL: test_mm_mask_broadcastd_epi32:
2097; X86:       # %bb.0: # %entry
2098; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2099; X86-NEXT:    kmovw %eax, %k1
2100; X86-NEXT:    vpbroadcastd %xmm1, %xmm0 {%k1}
2101; X86-NEXT:    retl
2102;
2103; X64-LABEL: test_mm_mask_broadcastd_epi32:
2104; X64:       # %bb.0: # %entry
2105; X64-NEXT:    kmovw %edi, %k1
2106; X64-NEXT:    vpbroadcastd %xmm1, %xmm0 {%k1}
2107; X64-NEXT:    retq
2108entry:
2109  %0 = bitcast <2 x i64> %__A to <4 x i32>
2110  %shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
2111  %1 = bitcast <2 x i64> %__O to <4 x i32>
2112  %2 = bitcast i8 %__M to <8 x i1>
2113  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2114  %3 = select <4 x i1> %extract.i, <4 x i32> %shuffle.i.i, <4 x i32> %1
2115  %4 = bitcast <4 x i32> %3 to <2 x i64>
2116  ret <2 x i64> %4
2117}
2118
2119define <2 x i64> @test_mm_maskz_broadcastd_epi32(i8 zeroext %__M, <2 x i64> %__A) {
2120; X86-LABEL: test_mm_maskz_broadcastd_epi32:
2121; X86:       # %bb.0: # %entry
2122; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2123; X86-NEXT:    kmovw %eax, %k1
2124; X86-NEXT:    vpbroadcastd %xmm0, %xmm0 {%k1} {z}
2125; X86-NEXT:    retl
2126;
2127; X64-LABEL: test_mm_maskz_broadcastd_epi32:
2128; X64:       # %bb.0: # %entry
2129; X64-NEXT:    kmovw %edi, %k1
2130; X64-NEXT:    vpbroadcastd %xmm0, %xmm0 {%k1} {z}
2131; X64-NEXT:    retq
2132entry:
2133  %0 = bitcast <2 x i64> %__A to <4 x i32>
2134  %shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
2135  %1 = bitcast i8 %__M to <8 x i1>
2136  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2137  %2 = select <4 x i1> %extract.i, <4 x i32> %shuffle.i.i, <4 x i32> zeroinitializer
2138  %3 = bitcast <4 x i32> %2 to <2 x i64>
2139  ret <2 x i64> %3
2140}
2141
2142define <4 x i64> @test_mm256_broadcastd_epi32(<2 x i64> %a0) {
2143; CHECK-LABEL: test_mm256_broadcastd_epi32:
2144; CHECK:       # %bb.0:
2145; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
2146; CHECK-NEXT:    ret{{[l|q]}}
2147  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
2148  %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <8 x i32> zeroinitializer
2149  %res1 = bitcast <8 x i32> %res0 to <4 x i64>
2150  ret <4 x i64> %res1
2151}
2152
2153define <4 x i64> @test_mm256_mask_broadcastd_epi32(<4 x i64> %a0, i8 %a1, <2 x i64> %a2) {
2154; X86-LABEL: test_mm256_mask_broadcastd_epi32:
2155; X86:       # %bb.0:
2156; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2157; X86-NEXT:    kmovw %eax, %k1
2158; X86-NEXT:    vpbroadcastd %xmm1, %ymm0 {%k1}
2159; X86-NEXT:    retl
2160;
2161; X64-LABEL: test_mm256_mask_broadcastd_epi32:
2162; X64:       # %bb.0:
2163; X64-NEXT:    kmovw %edi, %k1
2164; X64-NEXT:    vpbroadcastd %xmm1, %ymm0 {%k1}
2165; X64-NEXT:    retq
2166  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2167  %arg1 = bitcast i8 %a1 to <8 x i1>
2168  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
2169  %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <8 x i32> zeroinitializer
2170  %res1 = select <8 x i1> %arg1, <8 x i32> %res0, <8 x i32> %arg0
2171  %res2 = bitcast <8 x i32> %res1 to <4 x i64>
2172  ret <4 x i64> %res2
2173}
2174
2175define <4 x i64> @test_mm256_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) {
2176; X86-LABEL: test_mm256_maskz_broadcastd_epi32:
2177; X86:       # %bb.0:
2178; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2179; X86-NEXT:    kmovw %eax, %k1
2180; X86-NEXT:    vpbroadcastd %xmm0, %ymm0 {%k1} {z}
2181; X86-NEXT:    retl
2182;
2183; X64-LABEL: test_mm256_maskz_broadcastd_epi32:
2184; X64:       # %bb.0:
2185; X64-NEXT:    kmovw %edi, %k1
2186; X64-NEXT:    vpbroadcastd %xmm0, %ymm0 {%k1} {z}
2187; X64-NEXT:    retq
2188  %arg0 = bitcast i8 %a0 to <8 x i1>
2189  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2190  %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <8 x i32> zeroinitializer
2191  %res1 = select <8 x i1> %arg0, <8 x i32> %res0, <8 x i32> zeroinitializer
2192  %res2 = bitcast <8 x i32> %res1 to <4 x i64>
2193  ret <4 x i64> %res2
2194}
2195
2196define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
2197; CHECK-LABEL: test_mm_broadcastq_epi64:
2198; CHECK:       # %bb.0:
2199; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2200; CHECK-NEXT:    ret{{[l|q]}}
2201  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
2202  ret <2 x i64> %res
2203}
2204
2205define <2 x i64> @test_mm_mask_broadcastq_epi64(<2 x i64> %__O, i8 zeroext %__M, <2 x i64> %__A) {
2206; X86-LABEL: test_mm_mask_broadcastq_epi64:
2207; X86:       # %bb.0: # %entry
2208; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2209; X86-NEXT:    kmovw %eax, %k1
2210; X86-NEXT:    vpbroadcastq %xmm1, %xmm0 {%k1}
2211; X86-NEXT:    retl
2212;
2213; X64-LABEL: test_mm_mask_broadcastq_epi64:
2214; X64:       # %bb.0: # %entry
2215; X64-NEXT:    kmovw %edi, %k1
2216; X64-NEXT:    vpbroadcastq %xmm1, %xmm0 {%k1}
2217; X64-NEXT:    retq
2218entry:
2219  %shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <2 x i32> zeroinitializer
2220  %0 = bitcast i8 %__M to <8 x i1>
2221  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
2222  %1 = select <2 x i1> %extract.i, <2 x i64> %shuffle.i.i, <2 x i64> %__O
2223  ret <2 x i64> %1
2224}
2225
2226define <2 x i64> @test_mm_maskz_broadcastq_epi64(i8 zeroext %__M, <2 x i64> %__A) {
2227; X86-LABEL: test_mm_maskz_broadcastq_epi64:
2228; X86:       # %bb.0: # %entry
2229; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2230; X86-NEXT:    kmovw %eax, %k1
2231; X86-NEXT:    vpbroadcastq %xmm0, %xmm0 {%k1} {z}
2232; X86-NEXT:    retl
2233;
2234; X64-LABEL: test_mm_maskz_broadcastq_epi64:
2235; X64:       # %bb.0: # %entry
2236; X64-NEXT:    kmovw %edi, %k1
2237; X64-NEXT:    vpbroadcastq %xmm0, %xmm0 {%k1} {z}
2238; X64-NEXT:    retq
2239entry:
2240  %shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <2 x i32> zeroinitializer
2241  %0 = bitcast i8 %__M to <8 x i1>
2242  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
2243  %1 = select <2 x i1> %extract.i, <2 x i64> %shuffle.i.i, <2 x i64> zeroinitializer
2244  ret <2 x i64> %1
2245}
2246
2247define <4 x i64> @test_mm256_broadcastq_epi64(<2 x i64> %a0) {
2248; CHECK-LABEL: test_mm256_broadcastq_epi64:
2249; CHECK:       # %bb.0:
2250; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
2251; CHECK-NEXT:    ret{{[l|q]}}
2252  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> zeroinitializer
2253  ret <4 x i64> %res
2254}
2255
2256define <4 x i64> @test_mm256_mask_broadcastq_epi64(<4 x i64> %__O, i8 zeroext %__M, <2 x i64> %__A) {
2257; X86-LABEL: test_mm256_mask_broadcastq_epi64:
2258; X86:       # %bb.0: # %entry
2259; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2260; X86-NEXT:    kmovw %eax, %k1
2261; X86-NEXT:    vpbroadcastq %xmm1, %ymm0 {%k1}
2262; X86-NEXT:    retl
2263;
2264; X64-LABEL: test_mm256_mask_broadcastq_epi64:
2265; X64:       # %bb.0: # %entry
2266; X64-NEXT:    kmovw %edi, %k1
2267; X64-NEXT:    vpbroadcastq %xmm1, %ymm0 {%k1}
2268; X64-NEXT:    retq
2269entry:
2270  %shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <4 x i32> zeroinitializer
2271  %0 = bitcast i8 %__M to <8 x i1>
2272  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2273  %1 = select <4 x i1> %extract.i, <4 x i64> %shuffle.i.i, <4 x i64> %__O
2274  ret <4 x i64> %1
2275}
2276
2277define <4 x i64> @test_mm256_maskz_broadcastq_epi64(i8 zeroext %__M, <2 x i64> %__A) {
2278; X86-LABEL: test_mm256_maskz_broadcastq_epi64:
2279; X86:       # %bb.0: # %entry
2280; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2281; X86-NEXT:    kmovw %eax, %k1
2282; X86-NEXT:    vpbroadcastq %xmm0, %ymm0 {%k1} {z}
2283; X86-NEXT:    retl
2284;
2285; X64-LABEL: test_mm256_maskz_broadcastq_epi64:
2286; X64:       # %bb.0: # %entry
2287; X64-NEXT:    kmovw %edi, %k1
2288; X64-NEXT:    vpbroadcastq %xmm0, %ymm0 {%k1} {z}
2289; X64-NEXT:    retq
2290entry:
2291  %shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <4 x i32> zeroinitializer
2292  %0 = bitcast i8 %__M to <8 x i1>
2293  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2294  %1 = select <4 x i1> %extract.i, <4 x i64> %shuffle.i.i, <4 x i64> zeroinitializer
2295  ret <4 x i64> %1
2296}
2297
2298define <4 x double> @test_mm256_broadcastsd_pd(<2 x double> %a0) {
2299; CHECK-LABEL: test_mm256_broadcastsd_pd:
2300; CHECK:       # %bb.0:
2301; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
2302; CHECK-NEXT:    ret{{[l|q]}}
2303  %res = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> zeroinitializer
2304  ret <4 x double> %res
2305}
2306
2307define <4 x double> @test_mm256_mask_broadcastsd_pd(<4 x double> %__O, i8 zeroext %__M, <2 x double> %__A) {
2308; X86-LABEL: test_mm256_mask_broadcastsd_pd:
2309; X86:       # %bb.0: # %entry
2310; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2311; X86-NEXT:    kmovw %eax, %k1
2312; X86-NEXT:    vbroadcastsd %xmm1, %ymm0 {%k1}
2313; X86-NEXT:    retl
2314;
2315; X64-LABEL: test_mm256_mask_broadcastsd_pd:
2316; X64:       # %bb.0: # %entry
2317; X64-NEXT:    kmovw %edi, %k1
2318; X64-NEXT:    vbroadcastsd %xmm1, %ymm0 {%k1}
2319; X64-NEXT:    retq
2320entry:
2321  %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <4 x i32> zeroinitializer
2322  %0 = bitcast i8 %__M to <8 x i1>
2323  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2324  %1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> %__O
2325  ret <4 x double> %1
2326}
2327
2328define <4 x double> @test_mm256_maskz_broadcastsd_pd(i8 zeroext %__M, <2 x double> %__A) {
2329; X86-LABEL: test_mm256_maskz_broadcastsd_pd:
2330; X86:       # %bb.0: # %entry
2331; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2332; X86-NEXT:    kmovw %eax, %k1
2333; X86-NEXT:    vbroadcastsd %xmm0, %ymm0 {%k1} {z}
2334; X86-NEXT:    retl
2335;
2336; X64-LABEL: test_mm256_maskz_broadcastsd_pd:
2337; X64:       # %bb.0: # %entry
2338; X64-NEXT:    kmovw %edi, %k1
2339; X64-NEXT:    vbroadcastsd %xmm0, %ymm0 {%k1} {z}
2340; X64-NEXT:    retq
2341entry:
2342  %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <4 x i32> zeroinitializer
2343  %0 = bitcast i8 %__M to <8 x i1>
2344  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2345  %1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> zeroinitializer
2346  ret <4 x double> %1
2347}
2348
2349define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
2350; CHECK-LABEL: test_mm_broadcastss_ps:
2351; CHECK:       # %bb.0:
2352; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
2353; CHECK-NEXT:    ret{{[l|q]}}
2354  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
2355  ret <4 x float> %res
2356}
2357
2358define <4 x float> @test_mm_mask_broadcastss_ps(<4 x float> %__O, i8 zeroext %__M, <4 x float> %__A) {
2359; X86-LABEL: test_mm_mask_broadcastss_ps:
2360; X86:       # %bb.0: # %entry
2361; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2362; X86-NEXT:    kmovw %eax, %k1
2363; X86-NEXT:    vbroadcastss %xmm1, %xmm0 {%k1}
2364; X86-NEXT:    retl
2365;
2366; X64-LABEL: test_mm_mask_broadcastss_ps:
2367; X64:       # %bb.0: # %entry
2368; X64-NEXT:    kmovw %edi, %k1
2369; X64-NEXT:    vbroadcastss %xmm1, %xmm0 {%k1}
2370; X64-NEXT:    retq
2371entry:
2372  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> zeroinitializer
2373  %0 = bitcast i8 %__M to <8 x i1>
2374  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2375  %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> %__O
2376  ret <4 x float> %1
2377}
2378
2379define <4 x float> @test_mm_maskz_broadcastss_ps(i8 zeroext %__M, <4 x float> %__A) {
2380; X86-LABEL: test_mm_maskz_broadcastss_ps:
2381; X86:       # %bb.0: # %entry
2382; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2383; X86-NEXT:    kmovw %eax, %k1
2384; X86-NEXT:    vbroadcastss %xmm0, %xmm0 {%k1} {z}
2385; X86-NEXT:    retl
2386;
2387; X64-LABEL: test_mm_maskz_broadcastss_ps:
2388; X64:       # %bb.0: # %entry
2389; X64-NEXT:    kmovw %edi, %k1
2390; X64-NEXT:    vbroadcastss %xmm0, %xmm0 {%k1} {z}
2391; X64-NEXT:    retq
2392entry:
2393  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> zeroinitializer
2394  %0 = bitcast i8 %__M to <8 x i1>
2395  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2396  %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> zeroinitializer
2397  ret <4 x float> %1
2398}
2399
2400define <8 x float> @test_mm256_broadcastss_ps(<4 x float> %a0) {
2401; CHECK-LABEL: test_mm256_broadcastss_ps:
2402; CHECK:       # %bb.0:
2403; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
2404; CHECK-NEXT:    ret{{[l|q]}}
2405  %res = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> zeroinitializer
2406  ret <8 x float> %res
2407}
2408
2409define <8 x float> @test_mm256_mask_broadcastss_ps(<8 x float> %a0, i8 %a1, <4 x float> %a2) {
2410; X86-LABEL: test_mm256_mask_broadcastss_ps:
2411; X86:       # %bb.0:
2412; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2413; X86-NEXT:    kmovw %eax, %k1
2414; X86-NEXT:    vbroadcastss %xmm1, %ymm0 {%k1}
2415; X86-NEXT:    retl
2416;
2417; X64-LABEL: test_mm256_mask_broadcastss_ps:
2418; X64:       # %bb.0:
2419; X64-NEXT:    kmovw %edi, %k1
2420; X64-NEXT:    vbroadcastss %xmm1, %ymm0 {%k1}
2421; X64-NEXT:    retq
2422  %arg1 = bitcast i8 %a1 to <8 x i1>
2423  %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <8 x i32> zeroinitializer
2424  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
2425  ret <8 x float> %res1
2426}
2427
2428define <8 x float> @test_mm256_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) {
2429; X86-LABEL: test_mm256_maskz_broadcastss_ps:
2430; X86:       # %bb.0:
2431; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2432; X86-NEXT:    kmovw %eax, %k1
2433; X86-NEXT:    vbroadcastss %xmm0, %ymm0 {%k1} {z}
2434; X86-NEXT:    retl
2435;
2436; X64-LABEL: test_mm256_maskz_broadcastss_ps:
2437; X64:       # %bb.0:
2438; X64-NEXT:    kmovw %edi, %k1
2439; X64-NEXT:    vbroadcastss %xmm0, %ymm0 {%k1} {z}
2440; X64-NEXT:    retq
2441  %arg0 = bitcast i8 %a0 to <8 x i1>
2442  %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <8 x i32> zeroinitializer
2443  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
2444  ret <8 x float> %res1
2445}
2446
2447define <2 x double> @test_mm_movddup_pd(<2 x double> %a0) {
2448; CHECK-LABEL: test_mm_movddup_pd:
2449; CHECK:       # %bb.0:
2450; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2451; CHECK-NEXT:    ret{{[l|q]}}
2452  %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
2453  ret <2 x double> %res
2454}
2455
2456define <2 x double> @test_mm_mask_movedup_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A) {
2457; X86-LABEL: test_mm_mask_movedup_pd:
2458; X86:       # %bb.0: # %entry
2459; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2460; X86-NEXT:    kmovw %eax, %k1
2461; X86-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
2462; X86-NEXT:    retl
2463;
2464; X64-LABEL: test_mm_mask_movedup_pd:
2465; X64:       # %bb.0: # %entry
2466; X64-NEXT:    kmovw %edi, %k1
2467; X64-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
2468; X64-NEXT:    retq
2469entry:
2470  %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <2 x i32> zeroinitializer
2471  %0 = bitcast i8 %__U to <8 x i1>
2472  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
2473  %1 = select <2 x i1> %extract.i, <2 x double> %shuffle.i.i, <2 x double> %__W
2474  ret <2 x double> %1
2475}
2476
2477define <2 x double> @test_mm_maskz_movedup_pd(i8 zeroext %__U, <2 x double> %__A) {
2478; X86-LABEL: test_mm_maskz_movedup_pd:
2479; X86:       # %bb.0: # %entry
2480; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2481; X86-NEXT:    kmovw %eax, %k1
2482; X86-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
2483; X86-NEXT:    retl
2484;
2485; X64-LABEL: test_mm_maskz_movedup_pd:
2486; X64:       # %bb.0: # %entry
2487; X64-NEXT:    kmovw %edi, %k1
2488; X64-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
2489; X64-NEXT:    retq
2490entry:
2491  %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <2 x i32> zeroinitializer
2492  %0 = bitcast i8 %__U to <8 x i1>
2493  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
2494  %1 = select <2 x i1> %extract.i, <2 x double> %shuffle.i.i, <2 x double> zeroinitializer
2495  ret <2 x double> %1
2496}
2497
2498define <4 x double> @test_mm256_movddup_pd(<4 x double> %a0) {
2499; CHECK-LABEL: test_mm256_movddup_pd:
2500; CHECK:       # %bb.0:
2501; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
2502; CHECK-NEXT:    ret{{[l|q]}}
2503  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
2504  ret <4 x double> %res
2505}
2506
2507define <4 x double> @test_mm256_mask_movedup_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A) {
2508; X86-LABEL: test_mm256_mask_movedup_pd:
2509; X86:       # %bb.0: # %entry
2510; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2511; X86-NEXT:    kmovw %eax, %k1
2512; X86-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
2513; X86-NEXT:    retl
2514;
2515; X64-LABEL: test_mm256_mask_movedup_pd:
2516; X64:       # %bb.0: # %entry
2517; X64-NEXT:    kmovw %edi, %k1
2518; X64-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
2519; X64-NEXT:    retq
2520entry:
2521  %shuffle.i.i = shufflevector <4 x double> %__A, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
2522  %0 = bitcast i8 %__U to <8 x i1>
2523  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2524  %1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> %__W
2525  ret <4 x double> %1
2526}
2527
2528define <4 x double> @test_mm256_maskz_movedup_pd(i8 zeroext %__U, <4 x double> %__A) {
2529; X86-LABEL: test_mm256_maskz_movedup_pd:
2530; X86:       # %bb.0: # %entry
2531; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2532; X86-NEXT:    kmovw %eax, %k1
2533; X86-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
2534; X86-NEXT:    retl
2535;
2536; X64-LABEL: test_mm256_maskz_movedup_pd:
2537; X64:       # %bb.0: # %entry
2538; X64-NEXT:    kmovw %edi, %k1
2539; X64-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
2540; X64-NEXT:    retq
2541entry:
2542  %shuffle.i.i = shufflevector <4 x double> %__A, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
2543  %0 = bitcast i8 %__U to <8 x i1>
2544  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2545  %1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> zeroinitializer
2546  ret <4 x double> %1
2547}
2548
2549define <4 x float> @test_mm_movehdup_ps(<4 x float> %a0) {
2550; CHECK-LABEL: test_mm_movehdup_ps:
2551; CHECK:       # %bb.0:
2552; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
2553; CHECK-NEXT:    ret{{[l|q]}}
2554  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
2555  ret <4 x float> %res
2556}
2557
2558define <4 x float> @test_mm_mask_movehdup_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A) {
2559; X86-LABEL: test_mm_mask_movehdup_ps:
2560; X86:       # %bb.0: # %entry
2561; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2562; X86-NEXT:    kmovw %eax, %k1
2563; X86-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
2564; X86-NEXT:    retl
2565;
2566; X64-LABEL: test_mm_mask_movehdup_ps:
2567; X64:       # %bb.0: # %entry
2568; X64-NEXT:    kmovw %edi, %k1
2569; X64-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
2570; X64-NEXT:    retq
2571entry:
2572  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
2573  %0 = bitcast i8 %__U to <8 x i1>
2574  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2575  %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> %__W
2576  ret <4 x float> %1
2577}
2578
2579define <4 x float> @test_mm_maskz_movehdup_ps(i8 zeroext %__U, <4 x float> %__A) {
2580; X86-LABEL: test_mm_maskz_movehdup_ps:
2581; X86:       # %bb.0: # %entry
2582; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2583; X86-NEXT:    kmovw %eax, %k1
2584; X86-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
2585; X86-NEXT:    retl
2586;
2587; X64-LABEL: test_mm_maskz_movehdup_ps:
2588; X64:       # %bb.0: # %entry
2589; X64-NEXT:    kmovw %edi, %k1
2590; X64-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
2591; X64-NEXT:    retq
2592entry:
2593  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
2594  %0 = bitcast i8 %__U to <8 x i1>
2595  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2596  %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> zeroinitializer
2597  ret <4 x float> %1
2598}
2599
2600define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) {
2601; CHECK-LABEL: test_mm256_movehdup_ps:
2602; CHECK:       # %bb.0:
2603; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
2604; CHECK-NEXT:    ret{{[l|q]}}
2605  %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
2606  ret <8 x float> %res
2607}
2608
2609define <8 x float> @test_mm256_mask_movehdup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) {
2610; X86-LABEL: test_mm256_mask_movehdup_ps:
2611; X86:       # %bb.0:
2612; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2613; X86-NEXT:    kmovw %eax, %k1
2614; X86-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7]
2615; X86-NEXT:    retl
2616;
2617; X64-LABEL: test_mm256_mask_movehdup_ps:
2618; X64:       # %bb.0:
2619; X64-NEXT:    kmovw %edi, %k1
2620; X64-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7]
2621; X64-NEXT:    retq
2622  %arg1 = bitcast i8 %a1 to <8 x i1>
2623  %res0 = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
2624  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
2625  ret <8 x float> %res1
2626}
2627
2628define <8 x float> @test_mm256_maskz_movehdup_ps(i8 %a0, <8 x float> %a1) {
2629; X86-LABEL: test_mm256_maskz_movehdup_ps:
2630; X86:       # %bb.0:
2631; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2632; X86-NEXT:    kmovw %eax, %k1
2633; X86-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
2634; X86-NEXT:    retl
2635;
2636; X64-LABEL: test_mm256_maskz_movehdup_ps:
2637; X64:       # %bb.0:
2638; X64-NEXT:    kmovw %edi, %k1
2639; X64-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
2640; X64-NEXT:    retq
2641  %arg0 = bitcast i8 %a0 to <8 x i1>
2642  %res0 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
2643  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
2644  ret <8 x float> %res1
2645}
2646
2647define <4 x float> @test_mm_moveldup_ps(<4 x float> %a0) {
2648; CHECK-LABEL: test_mm_moveldup_ps:
2649; CHECK:       # %bb.0:
2650; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
2651; CHECK-NEXT:    ret{{[l|q]}}
2652  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
2653  ret <4 x float> %res
2654}
2655
2656define <4 x float> @test_mm_mask_moveldup_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A) {
2657; X86-LABEL: test_mm_mask_moveldup_ps:
2658; X86:       # %bb.0: # %entry
2659; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2660; X86-NEXT:    kmovw %eax, %k1
2661; X86-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
2662; X86-NEXT:    retl
2663;
2664; X64-LABEL: test_mm_mask_moveldup_ps:
2665; X64:       # %bb.0: # %entry
2666; X64-NEXT:    kmovw %edi, %k1
2667; X64-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
2668; X64-NEXT:    retq
2669entry:
2670  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
2671  %0 = bitcast i8 %__U to <8 x i1>
2672  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2673  %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> %__W
2674  ret <4 x float> %1
2675}
2676
2677define <4 x float> @test_mm_maskz_moveldup_ps(i8 zeroext %__U, <4 x float> %__A) {
2678; X86-LABEL: test_mm_maskz_moveldup_ps:
2679; X86:       # %bb.0: # %entry
2680; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2681; X86-NEXT:    kmovw %eax, %k1
2682; X86-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
2683; X86-NEXT:    retl
2684;
2685; X64-LABEL: test_mm_maskz_moveldup_ps:
2686; X64:       # %bb.0: # %entry
2687; X64-NEXT:    kmovw %edi, %k1
2688; X64-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
2689; X64-NEXT:    retq
2690entry:
2691  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
2692  %0 = bitcast i8 %__U to <8 x i1>
2693  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2694  %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> zeroinitializer
2695  ret <4 x float> %1
2696}
2697
2698define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) {
2699; CHECK-LABEL: test_mm256_moveldup_ps:
2700; CHECK:       # %bb.0:
2701; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
2702; CHECK-NEXT:    ret{{[l|q]}}
2703  %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
2704  ret <8 x float> %res
2705}
2706
2707define <8 x float> @test_mm256_mask_moveldup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) {
2708; X86-LABEL: test_mm256_mask_moveldup_ps:
2709; X86:       # %bb.0:
2710; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2711; X86-NEXT:    kmovw %eax, %k1
2712; X86-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
2713; X86-NEXT:    retl
2714;
2715; X64-LABEL: test_mm256_mask_moveldup_ps:
2716; X64:       # %bb.0:
2717; X64-NEXT:    kmovw %edi, %k1
2718; X64-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
2719; X64-NEXT:    retq
2720  %arg1 = bitcast i8 %a1 to <8 x i1>
2721  %res0 = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
2722  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
2723  ret <8 x float> %res1
2724}
2725
2726define <8 x float> @test_mm256_maskz_moveldup_ps(i8 %a0, <8 x float> %a1) {
2727; X86-LABEL: test_mm256_maskz_moveldup_ps:
2728; X86:       # %bb.0:
2729; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2730; X86-NEXT:    kmovw %eax, %k1
2731; X86-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
2732; X86-NEXT:    retl
2733;
2734; X64-LABEL: test_mm256_maskz_moveldup_ps:
2735; X64:       # %bb.0:
2736; X64-NEXT:    kmovw %edi, %k1
2737; X64-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
2738; X64-NEXT:    retq
2739  %arg0 = bitcast i8 %a0 to <8 x i1>
2740  %res0 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
2741  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
2742  ret <8 x float> %res1
2743}
2744
2745define <4 x i64> @test_mm256_permutex_epi64(<4 x i64> %a0) {
2746; CHECK-LABEL: test_mm256_permutex_epi64:
2747; CHECK:       # %bb.0:
2748; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
2749; CHECK-NEXT:    ret{{[l|q]}}
2750  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
2751  ret <4 x i64> %res
2752}
2753
2754define <4 x i64> @test_mm256_mask_permutex_epi64(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X) {
2755; X86-LABEL: test_mm256_mask_permutex_epi64:
2756; X86:       # %bb.0: # %entry
2757; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2758; X86-NEXT:    kmovw %eax, %k1
2759; X86-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm1[3,0,0,0]
2760; X86-NEXT:    retl
2761;
2762; X64-LABEL: test_mm256_mask_permutex_epi64:
2763; X64:       # %bb.0: # %entry
2764; X64-NEXT:    kmovw %edi, %k1
2765; X64-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm1[3,0,0,0]
2766; X64-NEXT:    retq
2767entry:
2768  %perm = shufflevector <4 x i64> %__X, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
2769  %0 = bitcast i8 %__M to <8 x i1>
2770  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2771  %1 = select <4 x i1> %extract, <4 x i64> %perm, <4 x i64> %__W
2772  ret <4 x i64> %1
2773}
2774
2775define <4 x i64> @test_mm256_maskz_permutex_epi64(i8 zeroext %__M, <4 x i64> %__X) {
2776; X86-LABEL: test_mm256_maskz_permutex_epi64:
2777; X86:       # %bb.0: # %entry
2778; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2779; X86-NEXT:    kmovw %eax, %k1
2780; X86-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0]
2781; X86-NEXT:    retl
2782;
2783; X64-LABEL: test_mm256_maskz_permutex_epi64:
2784; X64:       # %bb.0: # %entry
2785; X64-NEXT:    kmovw %edi, %k1
2786; X64-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0]
2787; X64-NEXT:    retq
2788entry:
2789  %perm = shufflevector <4 x i64> %__X, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
2790  %0 = bitcast i8 %__M to <8 x i1>
2791  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2792  %1 = select <4 x i1> %extract, <4 x i64> %perm, <4 x i64> zeroinitializer
2793  ret <4 x i64> %1
2794}
2795
2796define <4 x double> @test_mm256_permutex_pd(<4 x double> %a0) {
2797; CHECK-LABEL: test_mm256_permutex_pd:
2798; CHECK:       # %bb.0:
2799; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
2800; CHECK-NEXT:    ret{{[l|q]}}
2801  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
2802  ret <4 x double> %res
2803}
2804
2805define <4 x double> @test_mm256_mask_permutex_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__X) {
2806; X86-LABEL: test_mm256_mask_permutex_pd:
2807; X86:       # %bb.0: # %entry
2808; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2809; X86-NEXT:    kmovw %eax, %k1
2810; X86-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
2811; X86-NEXT:    retl
2812;
2813; X64-LABEL: test_mm256_mask_permutex_pd:
2814; X64:       # %bb.0: # %entry
2815; X64-NEXT:    kmovw %edi, %k1
2816; X64-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
2817; X64-NEXT:    retq
2818entry:
2819  %perm = shufflevector <4 x double> %__X, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
2820  %0 = bitcast i8 %__U to <8 x i1>
2821  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2822  %1 = select <4 x i1> %extract, <4 x double> %perm, <4 x double> %__W
2823  ret <4 x double> %1
2824}
2825
2826define <4 x double> @test_mm256_maskz_permutex_pd(i8 zeroext %__U, <4 x double> %__X) {
2827; X86-LABEL: test_mm256_maskz_permutex_pd:
2828; X86:       # %bb.0: # %entry
2829; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2830; X86-NEXT:    kmovw %eax, %k1
2831; X86-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
2832; X86-NEXT:    retl
2833;
2834; X64-LABEL: test_mm256_maskz_permutex_pd:
2835; X64:       # %bb.0: # %entry
2836; X64-NEXT:    kmovw %edi, %k1
2837; X64-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
2838; X64-NEXT:    retq
2839entry:
2840  %perm = shufflevector <4 x double> %__X, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
2841  %0 = bitcast i8 %__U to <8 x i1>
2842  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2843  %1 = select <4 x i1> %extract, <4 x double> %perm, <4 x double> zeroinitializer
2844  ret <4 x double> %1
2845}
2846
2847define <2 x double> @test_mm_shuffle_pd(<2 x double> %a0, <2 x double> %a1) {
2848; CHECK-LABEL: test_mm_shuffle_pd:
2849; CHECK:       # %bb.0:
2850; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
2851; CHECK-NEXT:    ret{{[l|q]}}
2852  %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
2853  ret <2 x double> %res
2854}
2855
2856define <2 x double> @test_mm_mask_shuffle_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2857; X86-LABEL: test_mm_mask_shuffle_pd:
2858; X86:       # %bb.0: # %entry
2859; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2860; X86-NEXT:    kmovw %eax, %k1
2861; X86-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
2862; X86-NEXT:    retl
2863;
2864; X64-LABEL: test_mm_mask_shuffle_pd:
2865; X64:       # %bb.0: # %entry
2866; X64-NEXT:    kmovw %edi, %k1
2867; X64-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
2868; X64-NEXT:    retq
2869entry:
2870  %shufp = shufflevector <2 x double> %__A, <2 x double> %__B, <2 x i32> <i32 1, i32 3>
2871  %0 = bitcast i8 %__U to <8 x i1>
2872  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
2873  %1 = select <2 x i1> %extract, <2 x double> %shufp, <2 x double> %__W
2874  ret <2 x double> %1
2875}
2876
2877define <2 x double> @test_mm_maskz_shuffle_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2878; X86-LABEL: test_mm_maskz_shuffle_pd:
2879; X86:       # %bb.0: # %entry
2880; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2881; X86-NEXT:    kmovw %eax, %k1
2882; X86-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
2883; X86-NEXT:    retl
2884;
2885; X64-LABEL: test_mm_maskz_shuffle_pd:
2886; X64:       # %bb.0: # %entry
2887; X64-NEXT:    kmovw %edi, %k1
2888; X64-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
2889; X64-NEXT:    retq
2890entry:
2891  %shufp = shufflevector <2 x double> %__A, <2 x double> %__B, <2 x i32> <i32 1, i32 3>
2892  %0 = bitcast i8 %__U to <8 x i1>
2893  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
2894  %1 = select <2 x i1> %extract, <2 x double> %shufp, <2 x double> zeroinitializer
2895  ret <2 x double> %1
2896}
2897
2898define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) {
2899; CHECK-LABEL: test_mm256_shuffle_pd:
2900; CHECK:       # %bb.0:
2901; CHECK-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
2902; CHECK-NEXT:    ret{{[l|q]}}
2903  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
2904  ret <4 x double> %res
2905}
2906
2907define <4 x double> @test_mm256_mask_shuffle_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
2908; X86-LABEL: test_mm256_mask_shuffle_pd:
2909; X86:       # %bb.0: # %entry
2910; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2911; X86-NEXT:    kmovw %eax, %k1
2912; X86-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
2913; X86-NEXT:    retl
2914;
2915; X64-LABEL: test_mm256_mask_shuffle_pd:
2916; X64:       # %bb.0: # %entry
2917; X64-NEXT:    kmovw %edi, %k1
2918; X64-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
2919; X64-NEXT:    retq
2920entry:
2921  %shufp = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
2922  %0 = bitcast i8 %__U to <8 x i1>
2923  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2924  %1 = select <4 x i1> %extract, <4 x double> %shufp, <4 x double> %__W
2925  ret <4 x double> %1
2926}
2927
2928define <4 x double> @test_mm256_maskz_shuffle_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
2929; X86-LABEL: test_mm256_maskz_shuffle_pd:
2930; X86:       # %bb.0: # %entry
2931; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2932; X86-NEXT:    kmovw %eax, %k1
2933; X86-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
2934; X86-NEXT:    retl
2935;
2936; X64-LABEL: test_mm256_maskz_shuffle_pd:
2937; X64:       # %bb.0: # %entry
2938; X64-NEXT:    kmovw %edi, %k1
2939; X64-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
2940; X64-NEXT:    retq
2941entry:
2942  %shufp = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
2943  %0 = bitcast i8 %__U to <8 x i1>
2944  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2945  %1 = select <4 x i1> %extract, <4 x double> %shufp, <4 x double> zeroinitializer
2946  ret <4 x double> %1
2947}
2948
2949define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) {
2950; CHECK-LABEL: test_mm_shuffle_ps:
2951; CHECK:       # %bb.0:
2952; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
2953; CHECK-NEXT:    ret{{[l|q]}}
2954  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
2955  ret <4 x float> %res
2956}
2957
2958define <4 x float> @test_mm_mask_shuffle_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2959; X86-LABEL: test_mm_mask_shuffle_ps:
2960; X86:       # %bb.0: # %entry
2961; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2962; X86-NEXT:    kmovw %eax, %k1
2963; X86-NEXT:    vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
2964; X86-NEXT:    retl
2965;
2966; X64-LABEL: test_mm_mask_shuffle_ps:
2967; X64:       # %bb.0: # %entry
2968; X64-NEXT:    kmovw %edi, %k1
2969; X64-NEXT:    vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
2970; X64-NEXT:    retq
2971entry:
2972  %shufp = shufflevector <4 x float> %__A, <4 x float> %__B, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
2973  %0 = bitcast i8 %__U to <8 x i1>
2974  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2975  %1 = select <4 x i1> %extract, <4 x float> %shufp, <4 x float> %__W
2976  ret <4 x float> %1
2977}
2978
2979define <4 x float> @test_mm_maskz_shuffle_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2980; X86-LABEL: test_mm_maskz_shuffle_ps:
2981; X86:       # %bb.0: # %entry
2982; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2983; X86-NEXT:    kmovw %eax, %k1
2984; X86-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
2985; X86-NEXT:    retl
2986;
2987; X64-LABEL: test_mm_maskz_shuffle_ps:
2988; X64:       # %bb.0: # %entry
2989; X64-NEXT:    kmovw %edi, %k1
2990; X64-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
2991; X64-NEXT:    retq
2992entry:
2993  %shufp = shufflevector <4 x float> %__A, <4 x float> %__B, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
2994  %0 = bitcast i8 %__U to <8 x i1>
2995  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2996  %1 = select <4 x i1> %extract, <4 x float> %shufp, <4 x float> zeroinitializer
2997  ret <4 x float> %1
2998}
2999
3000define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) {
3001; CHECK-LABEL: test_mm256_shuffle_ps:
3002; CHECK:       # %bb.0:
3003; CHECK-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
3004; CHECK-NEXT:    ret{{[l|q]}}
3005  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
3006  ret <8 x float> %res
3007}
3008
3009define <8 x float> @test_mm256_mask_shuffle_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2, <8 x float> %a3) {
3010; X86-LABEL: test_mm256_mask_shuffle_ps:
3011; X86:       # %bb.0:
3012; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3013; X86-NEXT:    kmovw %eax, %k1
3014; X86-NEXT:    vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4]
3015; X86-NEXT:    retl
3016;
3017; X64-LABEL: test_mm256_mask_shuffle_ps:
3018; X64:       # %bb.0:
3019; X64-NEXT:    kmovw %edi, %k1
3020; X64-NEXT:    vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4]
3021; X64-NEXT:    retq
3022  %arg1 = bitcast i8 %a1 to <8 x i1>
3023  %res0 = shufflevector <8 x float> %a2, <8 x float> %a3, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
3024  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
3025  ret <8 x float> %res1
3026}
3027
3028define <8 x float> @test_mm256_maskz_shuffle_ps(i8 %a0, <8 x float> %a1, <8 x float> %a2) {
3029; X86-LABEL: test_mm256_maskz_shuffle_ps:
3030; X86:       # %bb.0:
3031; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3032; X86-NEXT:    kmovw %eax, %k1
3033; X86-NEXT:    vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
3034; X86-NEXT:    retl
3035;
3036; X64-LABEL: test_mm256_maskz_shuffle_ps:
3037; X64:       # %bb.0:
3038; X64-NEXT:    kmovw %edi, %k1
3039; X64-NEXT:    vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
3040; X64-NEXT:    retq
3041  %arg0 = bitcast i8 %a0 to <8 x i1>
3042  %res0 = shufflevector <8 x float> %a1, <8 x float> %a2, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
3043  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
3044  ret <8 x float> %res1
3045}
3046
3047define <4 x i64> @test_mm256_mask_mul_epi32(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
3048; X86-LABEL: test_mm256_mask_mul_epi32:
3049; X86:       # %bb.0: # %entry
3050; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3051; X86-NEXT:    kmovw %eax, %k1
3052; X86-NEXT:    vpmuldq %ymm1, %ymm2, %ymm0 {%k1}
3053; X86-NEXT:    retl
3054;
3055; X64-LABEL: test_mm256_mask_mul_epi32:
3056; X64:       # %bb.0: # %entry
3057; X64-NEXT:    kmovw %edi, %k1
3058; X64-NEXT:    vpmuldq %ymm1, %ymm2, %ymm0 {%k1}
3059; X64-NEXT:    retq
3060entry:
3061  %tmp = shl <4 x i64> %__X, <i64 32, i64 32, i64 32, i64 32>
3062  %tmp1 = ashr exact <4 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32>
3063  %tmp2 = shl <4 x i64> %__Y, <i64 32, i64 32, i64 32, i64 32>
3064  %tmp3 = ashr exact <4 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32>
3065  %tmp4 = mul nsw <4 x i64> %tmp3, %tmp1
3066  %tmp5 = bitcast i8 %__M to <8 x i1>
3067  %extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3068  %tmp6 = select <4 x i1> %extract.i, <4 x i64> %tmp4, <4 x i64> %__W
3069  ret <4 x i64> %tmp6
3070}
3071
3072define <4 x i64> @test_mm256_maskz_mul_epi32(i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
3073; X86-LABEL: test_mm256_maskz_mul_epi32:
3074; X86:       # %bb.0:
3075; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3076; X86-NEXT:    kmovw %eax, %k1
3077; X86-NEXT:    vpmuldq %ymm0, %ymm1, %ymm0 {%k1} {z}
3078; X86-NEXT:    retl
3079;
3080; X64-LABEL: test_mm256_maskz_mul_epi32:
3081; X64:       # %bb.0:
3082; X64-NEXT:    kmovw %edi, %k1
3083; X64-NEXT:    vpmuldq %ymm0, %ymm1, %ymm0 {%k1} {z}
3084; X64-NEXT:    retq
3085  %tmp = shl <4 x i64> %__X, <i64 32, i64 32, i64 32, i64 32>
3086  %tmp1 = ashr exact <4 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32>
3087  %tmp2 = shl <4 x i64> %__Y, <i64 32, i64 32, i64 32, i64 32>
3088  %tmp3 = ashr exact <4 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32>
3089  %tmp4 = mul nsw <4 x i64> %tmp3, %tmp1
3090  %tmp5 = bitcast i8 %__M to <8 x i1>
3091  %extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3092  %tmp6 = select <4 x i1> %extract.i, <4 x i64> %tmp4, <4 x i64> zeroinitializer
3093  ret <4 x i64> %tmp6
3094}
3095
3096define <2 x i64> @test_mm_mask_mul_epi32(<2 x i64> %__W, i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
3097; X86-LABEL: test_mm_mask_mul_epi32:
3098; X86:       # %bb.0:
3099; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3100; X86-NEXT:    kmovw %eax, %k1
3101; X86-NEXT:    vpmuldq %xmm1, %xmm2, %xmm0 {%k1}
3102; X86-NEXT:    retl
3103;
3104; X64-LABEL: test_mm_mask_mul_epi32:
3105; X64:       # %bb.0:
3106; X64-NEXT:    kmovw %edi, %k1
3107; X64-NEXT:    vpmuldq %xmm1, %xmm2, %xmm0 {%k1}
3108; X64-NEXT:    retq
3109  %tmp = shl <2 x i64> %__X, <i64 32, i64 32>
3110  %tmp1 = ashr exact <2 x i64> %tmp, <i64 32, i64 32>
3111  %tmp2 = shl <2 x i64> %__Y, <i64 32, i64 32>
3112  %tmp3 = ashr exact <2 x i64> %tmp2, <i64 32, i64 32>
3113  %tmp4 = mul nsw <2 x i64> %tmp3, %tmp1
3114  %tmp5 = bitcast i8 %__M to <8 x i1>
3115  %extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
3116  %tmp6 = select <2 x i1> %extract.i, <2 x i64> %tmp4, <2 x i64> %__W
3117  ret <2 x i64> %tmp6
3118}
3119
3120define <2 x i64> @test_mm_maskz_mul_epi32(i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
3121; X86-LABEL: test_mm_maskz_mul_epi32:
3122; X86:       # %bb.0:
3123; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3124; X86-NEXT:    kmovw %eax, %k1
3125; X86-NEXT:    vpmuldq %xmm0, %xmm1, %xmm0 {%k1} {z}
3126; X86-NEXT:    retl
3127;
3128; X64-LABEL: test_mm_maskz_mul_epi32:
3129; X64:       # %bb.0:
3130; X64-NEXT:    kmovw %edi, %k1
3131; X64-NEXT:    vpmuldq %xmm0, %xmm1, %xmm0 {%k1} {z}
3132; X64-NEXT:    retq
3133  %tmp = shl <2 x i64> %__X, <i64 32, i64 32>
3134  %tmp1 = ashr exact <2 x i64> %tmp, <i64 32, i64 32>
3135  %tmp2 = shl <2 x i64> %__Y, <i64 32, i64 32>
3136  %tmp3 = ashr exact <2 x i64> %tmp2, <i64 32, i64 32>
3137  %tmp4 = mul nsw <2 x i64> %tmp3, %tmp1
3138  %tmp5 = bitcast i8 %__M to <8 x i1>
3139  %extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
3140  %tmp6 = select <2 x i1> %extract.i, <2 x i64> %tmp4, <2 x i64> zeroinitializer
3141  ret <2 x i64> %tmp6
3142}
3143
3144define <4 x i64> @test_mm256_mask_mul_epu32(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
3145; X86-LABEL: test_mm256_mask_mul_epu32:
3146; X86:       # %bb.0: # %entry
3147; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3148; X86-NEXT:    kmovw %eax, %k1
3149; X86-NEXT:    vpmuludq %ymm1, %ymm2, %ymm0 {%k1}
3150; X86-NEXT:    retl
3151;
3152; X64-LABEL: test_mm256_mask_mul_epu32:
3153; X64:       # %bb.0: # %entry
3154; X64-NEXT:    kmovw %edi, %k1
3155; X64-NEXT:    vpmuludq %ymm1, %ymm2, %ymm0 {%k1}
3156; X64-NEXT:    retq
3157entry:
3158  %tmp = and <4 x i64> %__X, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
3159  %tmp1 = and <4 x i64> %__Y, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
3160  %tmp2 = mul nuw <4 x i64> %tmp1, %tmp
3161  %tmp3 = bitcast i8 %__M to <8 x i1>
3162  %extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3163  %tmp4 = select <4 x i1> %extract.i, <4 x i64> %tmp2, <4 x i64> %__W
3164  ret <4 x i64> %tmp4
3165}
3166
3167define <4 x i64> @test_mm256_maskz_mul_epu32(i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
3168; X86-LABEL: test_mm256_maskz_mul_epu32:
3169; X86:       # %bb.0: # %entry
3170; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3171; X86-NEXT:    kmovw %eax, %k1
3172; X86-NEXT:    vpmuludq %ymm0, %ymm1, %ymm0 {%k1} {z}
3173; X86-NEXT:    retl
3174;
3175; X64-LABEL: test_mm256_maskz_mul_epu32:
3176; X64:       # %bb.0: # %entry
3177; X64-NEXT:    kmovw %edi, %k1
3178; X64-NEXT:    vpmuludq %ymm0, %ymm1, %ymm0 {%k1} {z}
3179; X64-NEXT:    retq
3180entry:
3181  %tmp = and <4 x i64> %__X, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
3182  %tmp1 = and <4 x i64> %__Y, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
3183  %tmp2 = mul nuw <4 x i64> %tmp1, %tmp
3184  %tmp3 = bitcast i8 %__M to <8 x i1>
3185  %extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3186  %tmp4 = select <4 x i1> %extract.i, <4 x i64> %tmp2, <4 x i64> zeroinitializer
3187  ret <4 x i64> %tmp4
3188}
3189
3190define <2 x i64> @test_mm_mask_mul_epu32(<2 x i64> %__W, i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
3191; X86-LABEL: test_mm_mask_mul_epu32:
3192; X86:       # %bb.0: # %entry
3193; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3194; X86-NEXT:    kmovw %eax, %k1
3195; X86-NEXT:    vpmuludq %xmm1, %xmm2, %xmm0 {%k1}
3196; X86-NEXT:    retl
3197;
3198; X64-LABEL: test_mm_mask_mul_epu32:
3199; X64:       # %bb.0: # %entry
3200; X64-NEXT:    kmovw %edi, %k1
3201; X64-NEXT:    vpmuludq %xmm1, %xmm2, %xmm0 {%k1}
3202; X64-NEXT:    retq
3203entry:
3204  %tmp = and <2 x i64> %__X, <i64 4294967295, i64 4294967295>
3205  %tmp1 = and <2 x i64> %__Y, <i64 4294967295, i64 4294967295>
3206  %tmp2 = mul nuw <2 x i64> %tmp1, %tmp
3207  %tmp3 = bitcast i8 %__M to <8 x i1>
3208  %extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
3209  %tmp4 = select <2 x i1> %extract.i, <2 x i64> %tmp2, <2 x i64> %__W
3210  ret <2 x i64> %tmp4
3211}
3212
3213define <2 x i64> @test_mm_maskz_mul_epu32(i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
3214; X86-LABEL: test_mm_maskz_mul_epu32:
3215; X86:       # %bb.0: # %entry
3216; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3217; X86-NEXT:    kmovw %eax, %k1
3218; X86-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0 {%k1} {z}
3219; X86-NEXT:    retl
3220;
3221; X64-LABEL: test_mm_maskz_mul_epu32:
3222; X64:       # %bb.0: # %entry
3223; X64-NEXT:    kmovw %edi, %k1
3224; X64-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0 {%k1} {z}
3225; X64-NEXT:    retq
3226entry:
3227  %tmp = and <2 x i64> %__X, <i64 4294967295, i64 4294967295>
3228  %tmp1 = and <2 x i64> %__Y, <i64 4294967295, i64 4294967295>
3229  %tmp2 = mul nuw <2 x i64> %tmp1, %tmp
3230  %tmp3 = bitcast i8 %__M to <8 x i1>
3231  %extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
3232  %tmp4 = select <2 x i1> %extract.i, <2 x i64> %tmp2, <2 x i64> zeroinitializer
3233  ret <2 x i64> %tmp4
3234}
3235
3236define <2 x i64> @test_mm_cvtepi32_epi8(<2 x i64> %__A) {
3237; CHECK-LABEL: test_mm_cvtepi32_epi8:
3238; CHECK:       # %bb.0: # %entry
3239; CHECK-NEXT:    vpmovdb %xmm0, %xmm0
3240; CHECK-NEXT:    ret{{[l|q]}}
3241entry:
3242  %0 = bitcast <2 x i64> %__A to <4 x i32>
3243  %conv.i = trunc <4 x i32> %0 to <4 x i8>
3244  %shuf.i = shufflevector <4 x i8> %conv.i, <4 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
3245  %1 = bitcast <16 x i8> %shuf.i to <2 x i64>
3246  ret <2 x i64> %1
3247}
3248
3249define <2 x i64> @test_mm_cvtepi32_epi16(<2 x i64> %__A) {
3250; CHECK-LABEL: test_mm_cvtepi32_epi16:
3251; CHECK:       # %bb.0: # %entry
3252; CHECK-NEXT:    vpmovdw %xmm0, %xmm0
3253; CHECK-NEXT:    ret{{[l|q]}}
3254entry:
3255  %0 = bitcast <2 x i64> %__A to <4 x i32>
3256  %conv.i = trunc <4 x i32> %0 to <4 x i16>
3257  %shuf.i = shufflevector <4 x i16> %conv.i, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3258  %1 = bitcast <8 x i16> %shuf.i to <2 x i64>
3259  ret <2 x i64> %1
3260}
3261
3262define <2 x i64> @test_mm_cvtepi64_epi8(<2 x i64> %__A) {
3263; CHECK-LABEL: test_mm_cvtepi64_epi8:
3264; CHECK:       # %bb.0: # %entry
3265; CHECK-NEXT:    vpmovqb %xmm0, %xmm0
3266; CHECK-NEXT:    ret{{[l|q]}}
3267entry:
3268  %conv.i = trunc <2 x i64> %__A to <2 x i8>
3269  %shuf.i = shufflevector <2 x i8> %conv.i, <2 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
3270  %0 = bitcast <16 x i8> %shuf.i to <2 x i64>
3271  ret <2 x i64> %0
3272}
3273
3274define <2 x i64> @test_mm_cvtepi64_epi16(<2 x i64> %__A) {
3275; CHECK-LABEL: test_mm_cvtepi64_epi16:
3276; CHECK:       # %bb.0: # %entry
3277; CHECK-NEXT:    vpmovqw %xmm0, %xmm0
3278; CHECK-NEXT:    ret{{[l|q]}}
3279entry:
3280  %conv.i = trunc <2 x i64> %__A to <2 x i16>
3281  %shuf.i = shufflevector <2 x i16> %conv.i, <2 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
3282  %0 = bitcast <8 x i16> %shuf.i to <2 x i64>
3283  ret <2 x i64> %0
3284}
3285
3286define <2 x i64> @test_mm_cvtepi64_epi32(<2 x i64> %__A) {
3287; CHECK-LABEL: test_mm_cvtepi64_epi32:
3288; CHECK:       # %bb.0: # %entry
3289; CHECK-NEXT:    vpmovqd %xmm0, %xmm0
3290; CHECK-NEXT:    ret{{[l|q]}}
3291entry:
3292  %conv.i = trunc <2 x i64> %__A to <2 x i32>
3293  %shuf.i = shufflevector <2 x i32> %conv.i, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3294  %0 = bitcast <4 x i32> %shuf.i to <2 x i64>
3295  ret <2 x i64> %0
3296}
3297
3298define <2 x i64> @test_mm256_cvtepi32_epi16(<4 x i64> %__A) local_unnamed_addr #0 {
3299; CHECK-LABEL: test_mm256_cvtepi32_epi16:
3300; CHECK:       # %bb.0: # %entry
3301; CHECK-NEXT:    vpmovdw %ymm0, %xmm0
3302; CHECK-NEXT:    vzeroupper
3303; CHECK-NEXT:    ret{{[l|q]}}
3304entry:
3305  %0 = bitcast <4 x i64> %__A to <8 x i32>
3306  %conv.i = trunc <8 x i32> %0 to <8 x i16>
3307  %1 = bitcast <8 x i16> %conv.i to <2 x i64>
3308  ret <2 x i64> %1
3309}
3310
3311define <2 x i64> @test_mm256_mask_cvtepi32_epi16(<2 x i64> %__O, i8 zeroext %__M, <4 x i64> %__A) {
3312; X86-LABEL: test_mm256_mask_cvtepi32_epi16:
3313; X86:       # %bb.0: # %entry
3314; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3315; X86-NEXT:    kmovw %eax, %k1
3316; X86-NEXT:    vpmovdw %ymm1, %xmm0 {%k1}
3317; X86-NEXT:    vzeroupper
3318; X86-NEXT:    retl
3319;
3320; X64-LABEL: test_mm256_mask_cvtepi32_epi16:
3321; X64:       # %bb.0: # %entry
3322; X64-NEXT:    kmovw %edi, %k1
3323; X64-NEXT:    vpmovdw %ymm1, %xmm0 {%k1}
3324; X64-NEXT:    vzeroupper
3325; X64-NEXT:    retq
3326entry:
3327  %0 = bitcast <4 x i64> %__A to <8 x i32>
3328  %1 = bitcast <2 x i64> %__O to <8 x i16>
3329  %2 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %0, <8 x i16> %1, i8 %__M)
3330  %3 = bitcast <8 x i16> %2 to <2 x i64>
3331  ret <2 x i64> %3
3332}
3333
3334define <2 x i64> @test_mm256_maskz_cvtepi32_epi16(i8 zeroext %__M, <4 x i64> %__A) {
3335; X86-LABEL: test_mm256_maskz_cvtepi32_epi16:
3336; X86:       # %bb.0: # %entry
3337; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3338; X86-NEXT:    kmovw %eax, %k1
3339; X86-NEXT:    vpmovdw %ymm0, %xmm0 {%k1} {z}
3340; X86-NEXT:    vzeroupper
3341; X86-NEXT:    retl
3342;
3343; X64-LABEL: test_mm256_maskz_cvtepi32_epi16:
3344; X64:       # %bb.0: # %entry
3345; X64-NEXT:    kmovw %edi, %k1
3346; X64-NEXT:    vpmovdw %ymm0, %xmm0 {%k1} {z}
3347; X64-NEXT:    vzeroupper
3348; X64-NEXT:    retq
3349entry:
3350  %0 = bitcast <4 x i64> %__A to <8 x i32>
3351  %1 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %0, <8 x i16> zeroinitializer, i8 %__M)
3352  %2 = bitcast <8 x i16> %1 to <2 x i64>
3353  ret <2 x i64> %2
3354}
3355
3356define <2 x i64> @test_mm256_cvtepi64_epi32(<4 x i64> %__A) local_unnamed_addr #0 {
3357; CHECK-LABEL: test_mm256_cvtepi64_epi32:
3358; CHECK:       # %bb.0: # %entry
3359; CHECK-NEXT:    vpmovqd %ymm0, %xmm0
3360; CHECK-NEXT:    vzeroupper
3361; CHECK-NEXT:    ret{{[l|q]}}
3362entry:
3363  %conv.i = trunc <4 x i64> %__A to <4 x i32>
3364  %0 = bitcast <4 x i32> %conv.i to <2 x i64>
3365  ret <2 x i64> %0
3366}
3367
3368define <2 x i64> @test_mm256_mask_cvtepi64_epi32(<2 x i64> %__O, i8 zeroext %__M, <4 x i64> %__A) {
3369; X86-LABEL: test_mm256_mask_cvtepi64_epi32:
3370; X86:       # %bb.0: # %entry
3371; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3372; X86-NEXT:    kmovw %eax, %k1
3373; X86-NEXT:    vpmovqd %ymm1, %xmm0 {%k1}
3374; X86-NEXT:    vzeroupper
3375; X86-NEXT:    retl
3376;
3377; X64-LABEL: test_mm256_mask_cvtepi64_epi32:
3378; X64:       # %bb.0: # %entry
3379; X64-NEXT:    kmovw %edi, %k1
3380; X64-NEXT:    vpmovqd %ymm1, %xmm0 {%k1}
3381; X64-NEXT:    vzeroupper
3382; X64-NEXT:    retq
3383entry:
3384  %conv.i.i = trunc <4 x i64> %__A to <4 x i32>
3385  %0 = bitcast <2 x i64> %__O to <4 x i32>
3386  %1 = bitcast i8 %__M to <8 x i1>
3387  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3388  %2 = select <4 x i1> %extract.i, <4 x i32> %conv.i.i, <4 x i32> %0
3389  %3 = bitcast <4 x i32> %2 to <2 x i64>
3390  ret <2 x i64> %3
3391}
3392
3393define <2 x i64> @test_mm256_maskz_cvtepi64_epi32(i8 zeroext %__M, <4 x i64> %__A) {
3394; X86-LABEL: test_mm256_maskz_cvtepi64_epi32:
3395; X86:       # %bb.0: # %entry
3396; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3397; X86-NEXT:    kmovw %eax, %k1
3398; X86-NEXT:    vpmovqd %ymm0, %xmm0 {%k1} {z}
3399; X86-NEXT:    vzeroupper
3400; X86-NEXT:    retl
3401;
3402; X64-LABEL: test_mm256_maskz_cvtepi64_epi32:
3403; X64:       # %bb.0: # %entry
3404; X64-NEXT:    kmovw %edi, %k1
3405; X64-NEXT:    vpmovqd %ymm0, %xmm0 {%k1} {z}
3406; X64-NEXT:    vzeroupper
3407; X64-NEXT:    retq
3408entry:
3409  %conv.i.i = trunc <4 x i64> %__A to <4 x i32>
3410  %0 = bitcast i8 %__M to <8 x i1>
3411  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3412  %1 = select <4 x i1> %extract.i, <4 x i32> %conv.i.i, <4 x i32> zeroinitializer
3413  %2 = bitcast <4 x i32> %1 to <2 x i64>
3414  ret <2 x i64> %2
3415}
3416
3417define <2 x i64> @test_mm256_cvtepi64_epi8(<4 x i64> %__A) {
3418; CHECK-LABEL: test_mm256_cvtepi64_epi8:
3419; CHECK:       # %bb.0: # %entry
3420; CHECK-NEXT:    vpmovqb %ymm0, %xmm0
3421; CHECK-NEXT:    vzeroupper
3422; CHECK-NEXT:    ret{{[l|q]}}
3423entry:
3424  %conv.i = trunc <4 x i64> %__A to <4 x i8>
3425  %shuf.i = shufflevector <4 x i8> %conv.i, <4 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
3426  %0 = bitcast <16 x i8> %shuf.i to <2 x i64>
3427  ret <2 x i64> %0
3428}
3429
3430define <2 x i64> @test_mm256_cvtepi64_epi16(<4 x i64> %__A) {
3431; CHECK-LABEL: test_mm256_cvtepi64_epi16:
3432; CHECK:       # %bb.0: # %entry
3433; CHECK-NEXT:    vpmovqw %ymm0, %xmm0
3434; CHECK-NEXT:    vzeroupper
3435; CHECK-NEXT:    ret{{[l|q]}}
3436entry:
3437  %conv.i = trunc <4 x i64> %__A to <4 x i16>
3438  %shuf.i = shufflevector <4 x i16> %conv.i, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3439  %0 = bitcast <8 x i16> %shuf.i to <2 x i64>
3440  ret <2 x i64> %0
3441}
3442
3443define <2 x i64> @test_mm256_cvtepi32_epi8(<4 x i64> %__A) {
3444; CHECK-LABEL: test_mm256_cvtepi32_epi8:
3445; CHECK:       # %bb.0: # %entry
3446; CHECK-NEXT:    vpmovdb %ymm0, %xmm0
3447; CHECK-NEXT:    vzeroupper
3448; CHECK-NEXT:    ret{{[l|q]}}
3449entry:
3450  %0 = bitcast <4 x i64> %__A to <8 x i32>
3451  %conv.i = trunc <8 x i32> %0 to <8 x i8>
3452  %shuf.i = shufflevector <8 x i8> %conv.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3453  %1 = bitcast <16 x i8> %shuf.i to <2 x i64>
3454  ret <2 x i64> %1
3455}
3456
3457define <2 x i64> @test_mm_ternarylogic_epi32(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) {
3458; CHECK-LABEL: test_mm_ternarylogic_epi32:
3459; CHECK:       # %bb.0: # %entry
3460; CHECK-NEXT:    vpternlogd $4, %xmm2, %xmm1, %xmm0
3461; CHECK-NEXT:    ret{{[l|q]}}
3462entry:
3463  %0 = bitcast <2 x i64> %__A to <4 x i32>
3464  %1 = bitcast <2 x i64> %__B to <4 x i32>
3465  %2 = bitcast <2 x i64> %__C to <4 x i32>
3466  %3 = tail call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, i32 4)
3467  %4 = bitcast <4 x i32> %3 to <2 x i64>
3468  ret <2 x i64> %4
3469}
3470
3471declare <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i32) #2
3472
3473define <2 x i64> @test_mm_mask_ternarylogic_epi32(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__B, <2 x i64> %__C) {
3474; X86-LABEL: test_mm_mask_ternarylogic_epi32:
3475; X86:       # %bb.0: # %entry
3476; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3477; X86-NEXT:    kmovw %eax, %k1
3478; X86-NEXT:    vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1}
3479; X86-NEXT:    retl
3480;
3481; X64-LABEL: test_mm_mask_ternarylogic_epi32:
3482; X64:       # %bb.0: # %entry
3483; X64-NEXT:    kmovw %edi, %k1
3484; X64-NEXT:    vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1}
3485; X64-NEXT:    retq
3486entry:
3487  %0 = bitcast <2 x i64> %__A to <4 x i32>
3488  %1 = bitcast <2 x i64> %__B to <4 x i32>
3489  %2 = bitcast <2 x i64> %__C to <4 x i32>
3490  %3 = tail call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, i32 4)
3491  %4 = bitcast i8 %__U to <8 x i1>
3492  %extract = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3493  %5 = select <4 x i1> %extract, <4 x i32> %3, <4 x i32> %0
3494  %6 = bitcast <4 x i32> %5 to <2 x i64>
3495  ret <2 x i64> %6
3496}
3497
3498define <2 x i64> @test_mm_maskz_ternarylogic_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) {
3499; X86-LABEL: test_mm_maskz_ternarylogic_epi32:
3500; X86:       # %bb.0: # %entry
3501; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3502; X86-NEXT:    kmovw %eax, %k1
3503; X86-NEXT:    vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1} {z}
3504; X86-NEXT:    retl
3505;
3506; X64-LABEL: test_mm_maskz_ternarylogic_epi32:
3507; X64:       # %bb.0: # %entry
3508; X64-NEXT:    kmovw %edi, %k1
3509; X64-NEXT:    vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1} {z}
3510; X64-NEXT:    retq
3511entry:
3512  %0 = bitcast <2 x i64> %__A to <4 x i32>
3513  %1 = bitcast <2 x i64> %__B to <4 x i32>
3514  %2 = bitcast <2 x i64> %__C to <4 x i32>
3515  %3 = tail call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, i32 4)
3516  %4 = bitcast i8 %__U to <8 x i1>
3517  %extract = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3518  %5 = select <4 x i1> %extract, <4 x i32> %3, <4 x i32> zeroinitializer
3519  %6 = bitcast <4 x i32> %5 to <2 x i64>
3520  ret <2 x i64> %6
3521}
3522
3523define <4 x i64> @test_mm256_ternarylogic_epi32(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) {
3524; CHECK-LABEL: test_mm256_ternarylogic_epi32:
3525; CHECK:       # %bb.0: # %entry
3526; CHECK-NEXT:    vpternlogd $4, %ymm2, %ymm1, %ymm0
3527; CHECK-NEXT:    ret{{[l|q]}}
3528entry:
3529  %0 = bitcast <4 x i64> %__A to <8 x i32>
3530  %1 = bitcast <4 x i64> %__B to <8 x i32>
3531  %2 = bitcast <4 x i64> %__C to <8 x i32>
3532  %3 = tail call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2, i32 4)
3533  %4 = bitcast <8 x i32> %3 to <4 x i64>
3534  ret <4 x i64> %4
3535}
3536
3537declare <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i32) #2
3538
3539define <4 x i64> @test_mm256_mask_ternarylogic_epi32(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__B, <4 x i64> %__C) {
3540; X86-LABEL: test_mm256_mask_ternarylogic_epi32:
3541; X86:       # %bb.0: # %entry
3542; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3543; X86-NEXT:    kmovw %eax, %k1
3544; X86-NEXT:    vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1}
3545; X86-NEXT:    retl
3546;
3547; X64-LABEL: test_mm256_mask_ternarylogic_epi32:
3548; X64:       # %bb.0: # %entry
3549; X64-NEXT:    kmovw %edi, %k1
3550; X64-NEXT:    vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1}
3551; X64-NEXT:    retq
3552entry:
3553  %0 = bitcast <4 x i64> %__A to <8 x i32>
3554  %1 = bitcast <4 x i64> %__B to <8 x i32>
3555  %2 = bitcast <4 x i64> %__C to <8 x i32>
3556  %3 = tail call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2, i32 4)
3557  %4 = bitcast i8 %__U to <8 x i1>
3558  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %0
3559  %6 = bitcast <8 x i32> %5 to <4 x i64>
3560  ret <4 x i64> %6
3561}
3562
3563define <4 x i64> @test_mm256_maskz_ternarylogic_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) {
3564; X86-LABEL: test_mm256_maskz_ternarylogic_epi32:
3565; X86:       # %bb.0: # %entry
3566; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3567; X86-NEXT:    kmovw %eax, %k1
3568; X86-NEXT:    vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1} {z}
3569; X86-NEXT:    retl
3570;
3571; X64-LABEL: test_mm256_maskz_ternarylogic_epi32:
3572; X64:       # %bb.0: # %entry
3573; X64-NEXT:    kmovw %edi, %k1
3574; X64-NEXT:    vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1} {z}
3575; X64-NEXT:    retq
3576entry:
3577  %0 = bitcast <4 x i64> %__A to <8 x i32>
3578  %1 = bitcast <4 x i64> %__B to <8 x i32>
3579  %2 = bitcast <4 x i64> %__C to <8 x i32>
3580  %3 = tail call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2, i32 4)
3581  %4 = bitcast i8 %__U to <8 x i1>
3582  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> zeroinitializer
3583  %6 = bitcast <8 x i32> %5 to <4 x i64>
3584  ret <4 x i64> %6
3585}
3586
3587define <2 x i64> @test_mm_ternarylogic_epi64(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) {
3588; CHECK-LABEL: test_mm_ternarylogic_epi64:
3589; CHECK:       # %bb.0: # %entry
3590; CHECK-NEXT:    vpternlogq $4, %xmm2, %xmm1, %xmm0
3591; CHECK-NEXT:    ret{{[l|q]}}
3592entry:
3593  %0 = tail call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C, i32 4)
3594  ret <2 x i64> %0
3595}
3596
3597declare <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i32) #2
3598
3599define <2 x i64> @test_mm_mask_ternarylogic_epi64(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__B, <2 x i64> %__C) {
3600; X86-LABEL: test_mm_mask_ternarylogic_epi64:
3601; X86:       # %bb.0: # %entry
3602; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3603; X86-NEXT:    kmovw %eax, %k1
3604; X86-NEXT:    vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1}
3605; X86-NEXT:    retl
3606;
3607; X64-LABEL: test_mm_mask_ternarylogic_epi64:
3608; X64:       # %bb.0: # %entry
3609; X64-NEXT:    kmovw %edi, %k1
3610; X64-NEXT:    vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1}
3611; X64-NEXT:    retq
3612entry:
3613  %0 = tail call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C, i32 4)
3614  %1 = bitcast i8 %__U to <8 x i1>
3615  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
3616  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__A
3617  ret <2 x i64> %2
3618}
3619
3620define <2 x i64> @test_mm_maskz_ternarylogic_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) {
3621; X86-LABEL: test_mm_maskz_ternarylogic_epi64:
3622; X86:       # %bb.0: # %entry
3623; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3624; X86-NEXT:    kmovw %eax, %k1
3625; X86-NEXT:    vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1} {z}
3626; X86-NEXT:    retl
3627;
3628; X64-LABEL: test_mm_maskz_ternarylogic_epi64:
3629; X64:       # %bb.0: # %entry
3630; X64-NEXT:    kmovw %edi, %k1
3631; X64-NEXT:    vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1} {z}
3632; X64-NEXT:    retq
3633entry:
3634  %0 = tail call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C, i32 4)
3635  %1 = bitcast i8 %__U to <8 x i1>
3636  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
3637  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer
3638  ret <2 x i64> %2
3639}
3640
3641define <4 x i64> @test_mm256_ternarylogic_epi64(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) {
3642; CHECK-LABEL: test_mm256_ternarylogic_epi64:
3643; CHECK:       # %bb.0: # %entry
3644; CHECK-NEXT:    vpternlogq $4, %ymm2, %ymm1, %ymm0
3645; CHECK-NEXT:    ret{{[l|q]}}
3646entry:
3647  %0 = tail call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C, i32 4)
3648  ret <4 x i64> %0
3649}
3650
3651declare <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i32) #2
3652
3653define <4 x i64> @test_mm256_mask_ternarylogic_epi64(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__B, <4 x i64> %__C) {
3654; X86-LABEL: test_mm256_mask_ternarylogic_epi64:
3655; X86:       # %bb.0: # %entry
3656; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3657; X86-NEXT:    kmovw %eax, %k1
3658; X86-NEXT:    vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1}
3659; X86-NEXT:    retl
3660;
3661; X64-LABEL: test_mm256_mask_ternarylogic_epi64:
3662; X64:       # %bb.0: # %entry
3663; X64-NEXT:    kmovw %edi, %k1
3664; X64-NEXT:    vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1}
3665; X64-NEXT:    retq
3666entry:
3667  %0 = tail call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C, i32 4)
3668  %1 = bitcast i8 %__U to <8 x i1>
3669  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3670  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__A
3671  ret <4 x i64> %2
3672}
3673
3674define <4 x i64> @test_mm256_maskz_ternarylogic_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) {
3675; X86-LABEL: test_mm256_maskz_ternarylogic_epi64:
3676; X86:       # %bb.0: # %entry
3677; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3678; X86-NEXT:    kmovw %eax, %k1
3679; X86-NEXT:    vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1} {z}
3680; X86-NEXT:    retl
3681;
3682; X64-LABEL: test_mm256_maskz_ternarylogic_epi64:
3683; X64:       # %bb.0: # %entry
3684; X64-NEXT:    kmovw %edi, %k1
3685; X64-NEXT:    vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1} {z}
3686; X64-NEXT:    retq
3687entry:
3688  %0 = tail call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C, i32 4)
3689  %1 = bitcast i8 %__U to <8 x i1>
3690  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3691  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer
3692  ret <4 x i64> %2
3693}
3694
3695define <2 x i64> @test_mm_mask2_permutex2var_epi32(<2 x i64> %__A, <2 x i64> %__I, i8 zeroext %__U, <2 x i64> %__B) {
3696; X86-LABEL: test_mm_mask2_permutex2var_epi32:
3697; X86:       # %bb.0: # %entry
3698; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3699; X86-NEXT:    kmovw %eax, %k1
3700; X86-NEXT:    vpermi2d %xmm2, %xmm0, %xmm1 {%k1}
3701; X86-NEXT:    vmovdqa %xmm1, %xmm0
3702; X86-NEXT:    retl
3703;
3704; X64-LABEL: test_mm_mask2_permutex2var_epi32:
3705; X64:       # %bb.0: # %entry
3706; X64-NEXT:    kmovw %edi, %k1
3707; X64-NEXT:    vpermi2d %xmm2, %xmm0, %xmm1 {%k1}
3708; X64-NEXT:    vmovdqa %xmm1, %xmm0
3709; X64-NEXT:    retq
3710entry:
3711  %0 = bitcast <2 x i64> %__A to <4 x i32>
3712  %1 = bitcast <2 x i64> %__I to <4 x i32>
3713  %2 = bitcast <2 x i64> %__B to <4 x i32>
3714  %3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
3715  %4 = bitcast i8 %__U to <8 x i1>
3716  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3717  %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> %1
3718  %6 = bitcast <4 x i32> %5 to <2 x i64>
3719  ret <2 x i64> %6
3720}
3721
3722define <4 x i64> @test_mm256_mask2_permutex2var_epi32(<4 x i64> %__A, <4 x i64> %__I, i8 zeroext %__U, <4 x i64> %__B) {
3723; X86-LABEL: test_mm256_mask2_permutex2var_epi32:
3724; X86:       # %bb.0: # %entry
3725; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3726; X86-NEXT:    kmovw %eax, %k1
3727; X86-NEXT:    vpermi2d %ymm2, %ymm0, %ymm1 {%k1}
3728; X86-NEXT:    vmovdqa %ymm1, %ymm0
3729; X86-NEXT:    retl
3730;
3731; X64-LABEL: test_mm256_mask2_permutex2var_epi32:
3732; X64:       # %bb.0: # %entry
3733; X64-NEXT:    kmovw %edi, %k1
3734; X64-NEXT:    vpermi2d %ymm2, %ymm0, %ymm1 {%k1}
3735; X64-NEXT:    vmovdqa %ymm1, %ymm0
3736; X64-NEXT:    retq
3737entry:
3738  %0 = bitcast <4 x i64> %__A to <8 x i32>
3739  %1 = bitcast <4 x i64> %__I to <8 x i32>
3740  %2 = bitcast <4 x i64> %__B to <8 x i32>
3741  %3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
3742  %4 = bitcast i8 %__U to <8 x i1>
3743  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %1
3744  %6 = bitcast <8 x i32> %5 to <4 x i64>
3745  ret <4 x i64> %6
3746}
3747
3748define <2 x double> @test_mm_mask2_permutex2var_pd(<2 x double> %__A, <2 x i64> %__I, i8 zeroext %__U, <2 x double> %__B) {
3749; X86-LABEL: test_mm_mask2_permutex2var_pd:
3750; X86:       # %bb.0: # %entry
3751; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3752; X86-NEXT:    kmovw %eax, %k1
3753; X86-NEXT:    vpermi2pd %xmm2, %xmm0, %xmm1 {%k1}
3754; X86-NEXT:    vmovapd %xmm1, %xmm0
3755; X86-NEXT:    retl
3756;
3757; X64-LABEL: test_mm_mask2_permutex2var_pd:
3758; X64:       # %bb.0: # %entry
3759; X64-NEXT:    kmovw %edi, %k1
3760; X64-NEXT:    vpermi2pd %xmm2, %xmm0, %xmm1 {%k1}
3761; X64-NEXT:    vmovapd %xmm1, %xmm0
3762; X64-NEXT:    retq
3763entry:
3764  %0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B)
3765  %1 = bitcast <2 x i64> %__I to <2 x double>
3766  %2 = bitcast i8 %__U to <8 x i1>
3767  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
3768  %3 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %1
3769  ret <2 x double> %3
3770}
3771
3772define <4 x double> @test_mm256_mask2_permutex2var_pd(<4 x double> %__A, <4 x i64> %__I, i8 zeroext %__U, <4 x double> %__B) {
3773; X86-LABEL: test_mm256_mask2_permutex2var_pd:
3774; X86:       # %bb.0: # %entry
3775; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3776; X86-NEXT:    kmovw %eax, %k1
3777; X86-NEXT:    vpermi2pd %ymm2, %ymm0, %ymm1 {%k1}
3778; X86-NEXT:    vmovapd %ymm1, %ymm0
3779; X86-NEXT:    retl
3780;
3781; X64-LABEL: test_mm256_mask2_permutex2var_pd:
3782; X64:       # %bb.0: # %entry
3783; X64-NEXT:    kmovw %edi, %k1
3784; X64-NEXT:    vpermi2pd %ymm2, %ymm0, %ymm1 {%k1}
3785; X64-NEXT:    vmovapd %ymm1, %ymm0
3786; X64-NEXT:    retq
3787entry:
3788  %0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B)
3789  %1 = bitcast <4 x i64> %__I to <4 x double>
3790  %2 = bitcast i8 %__U to <8 x i1>
3791  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3792  %3 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %1
3793  ret <4 x double> %3
3794}
3795
3796define <4 x float> @test_mm_mask2_permutex2var_ps(<4 x float> %__A, <2 x i64> %__I, i8 zeroext %__U, <4 x float> %__B) {
3797; X86-LABEL: test_mm_mask2_permutex2var_ps:
3798; X86:       # %bb.0: # %entry
3799; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3800; X86-NEXT:    kmovw %eax, %k1
3801; X86-NEXT:    vpermi2ps %xmm2, %xmm0, %xmm1 {%k1}
3802; X86-NEXT:    vmovaps %xmm1, %xmm0
3803; X86-NEXT:    retl
3804;
3805; X64-LABEL: test_mm_mask2_permutex2var_ps:
3806; X64:       # %bb.0: # %entry
3807; X64-NEXT:    kmovw %edi, %k1
3808; X64-NEXT:    vpermi2ps %xmm2, %xmm0, %xmm1 {%k1}
3809; X64-NEXT:    vmovaps %xmm1, %xmm0
3810; X64-NEXT:    retq
3811entry:
3812  %0 = bitcast <2 x i64> %__I to <4 x i32>
3813  %1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B)
3814  %2 = bitcast <2 x i64> %__I to <4 x float>
3815  %3 = bitcast i8 %__U to <8 x i1>
3816  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3817  %4 = select <4 x i1> %extract.i, <4 x float> %1, <4 x float> %2
3818  ret <4 x float> %4
3819}
3820
3821define <8 x float> @test_mm256_mask2_permutex2var_ps(<8 x float> %__A, <4 x i64> %__I, i8 zeroext %__U, <8 x float> %__B) {
3822; X86-LABEL: test_mm256_mask2_permutex2var_ps:
3823; X86:       # %bb.0: # %entry
3824; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3825; X86-NEXT:    kmovw %eax, %k1
3826; X86-NEXT:    vpermi2ps %ymm2, %ymm0, %ymm1 {%k1}
3827; X86-NEXT:    vmovaps %ymm1, %ymm0
3828; X86-NEXT:    retl
3829;
3830; X64-LABEL: test_mm256_mask2_permutex2var_ps:
3831; X64:       # %bb.0: # %entry
3832; X64-NEXT:    kmovw %edi, %k1
3833; X64-NEXT:    vpermi2ps %ymm2, %ymm0, %ymm1 {%k1}
3834; X64-NEXT:    vmovaps %ymm1, %ymm0
3835; X64-NEXT:    retq
3836entry:
3837  %0 = bitcast <4 x i64> %__I to <8 x i32>
3838  %1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B)
3839  %2 = bitcast <4 x i64> %__I to <8 x float>
3840  %3 = bitcast i8 %__U to <8 x i1>
3841  %4 = select <8 x i1> %3, <8 x float> %1, <8 x float> %2
3842  ret <8 x float> %4
3843}
3844
3845define <2 x i64> @test_mm_mask2_permutex2var_epi64(<2 x i64> %__A, <2 x i64> %__I, i8 zeroext %__U, <2 x i64> %__B) {
3846; X86-LABEL: test_mm_mask2_permutex2var_epi64:
3847; X86:       # %bb.0: # %entry
3848; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3849; X86-NEXT:    kmovw %eax, %k1
3850; X86-NEXT:    vpermi2q %xmm2, %xmm0, %xmm1 {%k1}
3851; X86-NEXT:    vmovdqa %xmm1, %xmm0
3852; X86-NEXT:    retl
3853;
3854; X64-LABEL: test_mm_mask2_permutex2var_epi64:
3855; X64:       # %bb.0: # %entry
3856; X64-NEXT:    kmovw %edi, %k1
3857; X64-NEXT:    vpermi2q %xmm2, %xmm0, %xmm1 {%k1}
3858; X64-NEXT:    vmovdqa %xmm1, %xmm0
3859; X64-NEXT:    retq
3860entry:
3861  %0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B)
3862  %1 = bitcast i8 %__U to <8 x i1>
3863  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
3864  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__I
3865  ret <2 x i64> %2
3866}
3867
3868define <4 x i64> @test_mm256_mask2_permutex2var_epi64(<4 x i64> %__A, <4 x i64> %__I, i8 zeroext %__U, <4 x i64> %__B) {
3869; X86-LABEL: test_mm256_mask2_permutex2var_epi64:
3870; X86:       # %bb.0: # %entry
3871; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3872; X86-NEXT:    kmovw %eax, %k1
3873; X86-NEXT:    vpermi2q %ymm2, %ymm0, %ymm1 {%k1}
3874; X86-NEXT:    vmovdqa %ymm1, %ymm0
3875; X86-NEXT:    retl
3876;
3877; X64-LABEL: test_mm256_mask2_permutex2var_epi64:
3878; X64:       # %bb.0: # %entry
3879; X64-NEXT:    kmovw %edi, %k1
3880; X64-NEXT:    vpermi2q %ymm2, %ymm0, %ymm1 {%k1}
3881; X64-NEXT:    vmovdqa %ymm1, %ymm0
3882; X64-NEXT:    retq
3883entry:
3884  %0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B)
3885  %1 = bitcast i8 %__U to <8 x i1>
3886  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3887  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__I
3888  ret <4 x i64> %2
3889}
3890
3891define <2 x i64> @test_mm_permutex2var_epi32(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) {
3892; CHECK-LABEL: test_mm_permutex2var_epi32:
3893; CHECK:       # %bb.0: # %entry
3894; CHECK-NEXT:    vpermt2d %xmm2, %xmm1, %xmm0
3895; CHECK-NEXT:    ret{{[l|q]}}
3896entry:
3897  %0 = bitcast <2 x i64> %__A to <4 x i32>
3898  %1 = bitcast <2 x i64> %__I to <4 x i32>
3899  %2 = bitcast <2 x i64> %__B to <4 x i32>
3900  %3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
3901  %4 = bitcast <4 x i32> %3 to <2 x i64>
3902  ret <2 x i64> %4
3903}
3904
3905define <2 x i64> @test_mm_mask_permutex2var_epi32(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__I, <2 x i64> %__B) {
3906; X86-LABEL: test_mm_mask_permutex2var_epi32:
3907; X86:       # %bb.0: # %entry
3908; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3909; X86-NEXT:    kmovw %eax, %k1
3910; X86-NEXT:    vpermt2d %xmm2, %xmm1, %xmm0 {%k1}
3911; X86-NEXT:    retl
3912;
3913; X64-LABEL: test_mm_mask_permutex2var_epi32:
3914; X64:       # %bb.0: # %entry
3915; X64-NEXT:    kmovw %edi, %k1
3916; X64-NEXT:    vpermt2d %xmm2, %xmm1, %xmm0 {%k1}
3917; X64-NEXT:    retq
3918entry:
3919  %0 = bitcast <2 x i64> %__A to <4 x i32>
3920  %1 = bitcast <2 x i64> %__I to <4 x i32>
3921  %2 = bitcast <2 x i64> %__B to <4 x i32>
3922  %3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
3923  %4 = bitcast i8 %__U to <8 x i1>
3924  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3925  %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> %0
3926  %6 = bitcast <4 x i32> %5 to <2 x i64>
3927  ret <2 x i64> %6
3928}
3929
3930define <2 x i64> @test_mm_maskz_permutex2var_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) {
3931; X86-LABEL: test_mm_maskz_permutex2var_epi32:
3932; X86:       # %bb.0: # %entry
3933; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3934; X86-NEXT:    kmovw %eax, %k1
3935; X86-NEXT:    vpermt2d %xmm2, %xmm1, %xmm0 {%k1} {z}
3936; X86-NEXT:    retl
3937;
3938; X64-LABEL: test_mm_maskz_permutex2var_epi32:
3939; X64:       # %bb.0: # %entry
3940; X64-NEXT:    kmovw %edi, %k1
3941; X64-NEXT:    vpermt2d %xmm2, %xmm1, %xmm0 {%k1} {z}
3942; X64-NEXT:    retq
3943entry:
3944  %0 = bitcast <2 x i64> %__A to <4 x i32>
3945  %1 = bitcast <2 x i64> %__I to <4 x i32>
3946  %2 = bitcast <2 x i64> %__B to <4 x i32>
3947  %3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
3948  %4 = bitcast i8 %__U to <8 x i1>
3949  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3950  %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> zeroinitializer
3951  %6 = bitcast <4 x i32> %5 to <2 x i64>
3952  ret <2 x i64> %6
3953}
3954
3955define <4 x i64> @test_mm256_permutex2var_epi32(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) {
3956; CHECK-LABEL: test_mm256_permutex2var_epi32:
3957; CHECK:       # %bb.0: # %entry
3958; CHECK-NEXT:    vpermt2d %ymm2, %ymm1, %ymm0
3959; CHECK-NEXT:    ret{{[l|q]}}
3960entry:
3961  %0 = bitcast <4 x i64> %__A to <8 x i32>
3962  %1 = bitcast <4 x i64> %__I to <8 x i32>
3963  %2 = bitcast <4 x i64> %__B to <8 x i32>
3964  %3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
3965  %4 = bitcast <8 x i32> %3 to <4 x i64>
3966  ret <4 x i64> %4
3967}
3968
3969define <4 x i64> @test_mm256_mask_permutex2var_epi32(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__I, <4 x i64> %__B) {
3970; X86-LABEL: test_mm256_mask_permutex2var_epi32:
3971; X86:       # %bb.0: # %entry
3972; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3973; X86-NEXT:    kmovw %eax, %k1
3974; X86-NEXT:    vpermt2d %ymm2, %ymm1, %ymm0 {%k1}
3975; X86-NEXT:    retl
3976;
3977; X64-LABEL: test_mm256_mask_permutex2var_epi32:
3978; X64:       # %bb.0: # %entry
3979; X64-NEXT:    kmovw %edi, %k1
3980; X64-NEXT:    vpermt2d %ymm2, %ymm1, %ymm0 {%k1}
3981; X64-NEXT:    retq
3982entry:
3983  %0 = bitcast <4 x i64> %__A to <8 x i32>
3984  %1 = bitcast <4 x i64> %__I to <8 x i32>
3985  %2 = bitcast <4 x i64> %__B to <8 x i32>
3986  %3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
3987  %4 = bitcast i8 %__U to <8 x i1>
3988  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %0
3989  %6 = bitcast <8 x i32> %5 to <4 x i64>
3990  ret <4 x i64> %6
3991}
3992
3993define <4 x i64> @test_mm256_maskz_permutex2var_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) {
3994; X86-LABEL: test_mm256_maskz_permutex2var_epi32:
3995; X86:       # %bb.0: # %entry
3996; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
3997; X86-NEXT:    kmovw %eax, %k1
3998; X86-NEXT:    vpermt2d %ymm2, %ymm1, %ymm0 {%k1} {z}
3999; X86-NEXT:    retl
4000;
4001; X64-LABEL: test_mm256_maskz_permutex2var_epi32:
4002; X64:       # %bb.0: # %entry
4003; X64-NEXT:    kmovw %edi, %k1
4004; X64-NEXT:    vpermt2d %ymm2, %ymm1, %ymm0 {%k1} {z}
4005; X64-NEXT:    retq
4006entry:
4007  %0 = bitcast <4 x i64> %__A to <8 x i32>
4008  %1 = bitcast <4 x i64> %__I to <8 x i32>
4009  %2 = bitcast <4 x i64> %__B to <8 x i32>
4010  %3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
4011  %4 = bitcast i8 %__U to <8 x i1>
4012  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> zeroinitializer
4013  %6 = bitcast <8 x i32> %5 to <4 x i64>
4014  ret <4 x i64> %6
4015}
4016
4017define <2 x double> @test_mm_permutex2var_pd(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B) {
4018; CHECK-LABEL: test_mm_permutex2var_pd:
4019; CHECK:       # %bb.0: # %entry
4020; CHECK-NEXT:    vpermt2pd %xmm2, %xmm1, %xmm0
4021; CHECK-NEXT:    ret{{[l|q]}}
4022entry:
4023  %0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B)
4024  ret <2 x double> %0
4025}
4026
4027define <2 x double> @test_mm_mask_permutex2var_pd(<2 x double> %__A, i8 zeroext %__U, <2 x i64> %__I, <2 x double> %__B) {
4028; X86-LABEL: test_mm_mask_permutex2var_pd:
4029; X86:       # %bb.0: # %entry
4030; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4031; X86-NEXT:    kmovw %eax, %k1
4032; X86-NEXT:    vpermt2pd %xmm2, %xmm1, %xmm0 {%k1}
4033; X86-NEXT:    retl
4034;
4035; X64-LABEL: test_mm_mask_permutex2var_pd:
4036; X64:       # %bb.0: # %entry
4037; X64-NEXT:    kmovw %edi, %k1
4038; X64-NEXT:    vpermt2pd %xmm2, %xmm1, %xmm0 {%k1}
4039; X64-NEXT:    retq
4040entry:
4041  %0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B)
4042  %1 = bitcast i8 %__U to <8 x i1>
4043  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
4044  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
4045  ret <2 x double> %2
4046}
4047
4048define <2 x double> @test_mm_maskz_permutex2var_pd(i8 zeroext %__U, <2 x double> %__A, <2 x i64> %__I, <2 x double> %__B) {
4049; X86-LABEL: test_mm_maskz_permutex2var_pd:
4050; X86:       # %bb.0: # %entry
4051; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4052; X86-NEXT:    kmovw %eax, %k1
4053; X86-NEXT:    vpermt2pd %xmm2, %xmm1, %xmm0 {%k1} {z}
4054; X86-NEXT:    retl
4055;
4056; X64-LABEL: test_mm_maskz_permutex2var_pd:
4057; X64:       # %bb.0: # %entry
4058; X64-NEXT:    kmovw %edi, %k1
4059; X64-NEXT:    vpermt2pd %xmm2, %xmm1, %xmm0 {%k1} {z}
4060; X64-NEXT:    retq
4061entry:
4062  %0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B)
4063  %1 = bitcast i8 %__U to <8 x i1>
4064  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
4065  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
4066  ret <2 x double> %2
4067}
4068
4069define <4 x double> @test_mm256_permutex2var_pd(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B) {
4070; CHECK-LABEL: test_mm256_permutex2var_pd:
4071; CHECK:       # %bb.0: # %entry
4072; CHECK-NEXT:    vpermt2pd %ymm2, %ymm1, %ymm0
4073; CHECK-NEXT:    ret{{[l|q]}}
4074entry:
4075  %0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B)
4076  ret <4 x double> %0
4077}
4078
4079define <4 x double> @test_mm256_mask_permutex2var_pd(<4 x double> %__A, i8 zeroext %__U, <4 x i64> %__I, <4 x double> %__B) {
4080; X86-LABEL: test_mm256_mask_permutex2var_pd:
4081; X86:       # %bb.0: # %entry
4082; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4083; X86-NEXT:    kmovw %eax, %k1
4084; X86-NEXT:    vpermt2pd %ymm2, %ymm1, %ymm0 {%k1}
4085; X86-NEXT:    retl
4086;
4087; X64-LABEL: test_mm256_mask_permutex2var_pd:
4088; X64:       # %bb.0: # %entry
4089; X64-NEXT:    kmovw %edi, %k1
4090; X64-NEXT:    vpermt2pd %ymm2, %ymm1, %ymm0 {%k1}
4091; X64-NEXT:    retq
4092entry:
4093  %0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B)
4094  %1 = bitcast i8 %__U to <8 x i1>
4095  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4096  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
4097  ret <4 x double> %2
4098}
4099
4100define <4 x double> @test_mm256_maskz_permutex2var_pd(i8 zeroext %__U, <4 x double> %__A, <4 x i64> %__I, <4 x double> %__B) {
4101; X86-LABEL: test_mm256_maskz_permutex2var_pd:
4102; X86:       # %bb.0: # %entry
4103; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4104; X86-NEXT:    kmovw %eax, %k1
4105; X86-NEXT:    vpermt2pd %ymm2, %ymm1, %ymm0 {%k1} {z}
4106; X86-NEXT:    retl
4107;
4108; X64-LABEL: test_mm256_maskz_permutex2var_pd:
4109; X64:       # %bb.0: # %entry
4110; X64-NEXT:    kmovw %edi, %k1
4111; X64-NEXT:    vpermt2pd %ymm2, %ymm1, %ymm0 {%k1} {z}
4112; X64-NEXT:    retq
4113entry:
4114  %0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B)
4115  %1 = bitcast i8 %__U to <8 x i1>
4116  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4117  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
4118  ret <4 x double> %2
4119}
4120
4121define <4 x float> @test_mm_permutex2var_ps(<4 x float> %__A, <2 x i64> %__I, <4 x float> %__B) {
4122; CHECK-LABEL: test_mm_permutex2var_ps:
4123; CHECK:       # %bb.0: # %entry
4124; CHECK-NEXT:    vpermt2ps %xmm2, %xmm1, %xmm0
4125; CHECK-NEXT:    ret{{[l|q]}}
4126entry:
4127  %0 = bitcast <2 x i64> %__I to <4 x i32>
4128  %1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B)
4129  ret <4 x float> %1
4130}
4131
4132define <4 x float> @test_mm_mask_permutex2var_ps(<4 x float> %__A, i8 zeroext %__U, <2 x i64> %__I, <4 x float> %__B) {
4133; X86-LABEL: test_mm_mask_permutex2var_ps:
4134; X86:       # %bb.0: # %entry
4135; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4136; X86-NEXT:    kmovw %eax, %k1
4137; X86-NEXT:    vpermt2ps %xmm2, %xmm1, %xmm0 {%k1}
4138; X86-NEXT:    retl
4139;
4140; X64-LABEL: test_mm_mask_permutex2var_ps:
4141; X64:       # %bb.0: # %entry
4142; X64-NEXT:    kmovw %edi, %k1
4143; X64-NEXT:    vpermt2ps %xmm2, %xmm1, %xmm0 {%k1}
4144; X64-NEXT:    retq
4145entry:
4146  %0 = bitcast <2 x i64> %__I to <4 x i32>
4147  %1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B)
4148  %2 = bitcast i8 %__U to <8 x i1>
4149  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4150  %3 = select <4 x i1> %extract.i, <4 x float> %1, <4 x float> %__A
4151  ret <4 x float> %3
4152}
4153
4154define <4 x float> @test_mm_maskz_permutex2var_ps(i8 zeroext %__U, <4 x float> %__A, <2 x i64> %__I, <4 x float> %__B) {
4155; X86-LABEL: test_mm_maskz_permutex2var_ps:
4156; X86:       # %bb.0: # %entry
4157; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4158; X86-NEXT:    kmovw %eax, %k1
4159; X86-NEXT:    vpermt2ps %xmm2, %xmm1, %xmm0 {%k1} {z}
4160; X86-NEXT:    retl
4161;
4162; X64-LABEL: test_mm_maskz_permutex2var_ps:
4163; X64:       # %bb.0: # %entry
4164; X64-NEXT:    kmovw %edi, %k1
4165; X64-NEXT:    vpermt2ps %xmm2, %xmm1, %xmm0 {%k1} {z}
4166; X64-NEXT:    retq
4167entry:
4168  %0 = bitcast <2 x i64> %__I to <4 x i32>
4169  %1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B)
4170  %2 = bitcast i8 %__U to <8 x i1>
4171  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4172  %3 = select <4 x i1> %extract.i, <4 x float> %1, <4 x float> zeroinitializer
4173  ret <4 x float> %3
4174}
4175
4176define <8 x float> @test_mm256_permutex2var_ps(<8 x float> %__A, <4 x i64> %__I, <8 x float> %__B) {
4177; CHECK-LABEL: test_mm256_permutex2var_ps:
4178; CHECK:       # %bb.0: # %entry
4179; CHECK-NEXT:    vpermt2ps %ymm2, %ymm1, %ymm0
4180; CHECK-NEXT:    ret{{[l|q]}}
4181entry:
4182  %0 = bitcast <4 x i64> %__I to <8 x i32>
4183  %1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B)
4184  ret <8 x float> %1
4185}
4186
4187define <8 x float> @test_mm256_mask_permutex2var_ps(<8 x float> %__A, i8 zeroext %__U, <4 x i64> %__I, <8 x float> %__B) {
4188; X86-LABEL: test_mm256_mask_permutex2var_ps:
4189; X86:       # %bb.0: # %entry
4190; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4191; X86-NEXT:    kmovw %eax, %k1
4192; X86-NEXT:    vpermt2ps %ymm2, %ymm1, %ymm0 {%k1}
4193; X86-NEXT:    retl
4194;
4195; X64-LABEL: test_mm256_mask_permutex2var_ps:
4196; X64:       # %bb.0: # %entry
4197; X64-NEXT:    kmovw %edi, %k1
4198; X64-NEXT:    vpermt2ps %ymm2, %ymm1, %ymm0 {%k1}
4199; X64-NEXT:    retq
4200entry:
4201  %0 = bitcast <4 x i64> %__I to <8 x i32>
4202  %1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B)
4203  %2 = bitcast i8 %__U to <8 x i1>
4204  %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %__A
4205  ret <8 x float> %3
4206}
4207
4208define <8 x float> @test_mm256_maskz_permutex2var_ps(i8 zeroext %__U, <8 x float> %__A, <4 x i64> %__I, <8 x float> %__B) {
4209; X86-LABEL: test_mm256_maskz_permutex2var_ps:
4210; X86:       # %bb.0: # %entry
4211; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4212; X86-NEXT:    kmovw %eax, %k1
4213; X86-NEXT:    vpermt2ps %ymm2, %ymm1, %ymm0 {%k1} {z}
4214; X86-NEXT:    retl
4215;
4216; X64-LABEL: test_mm256_maskz_permutex2var_ps:
4217; X64:       # %bb.0: # %entry
4218; X64-NEXT:    kmovw %edi, %k1
4219; X64-NEXT:    vpermt2ps %ymm2, %ymm1, %ymm0 {%k1} {z}
4220; X64-NEXT:    retq
4221entry:
4222  %0 = bitcast <4 x i64> %__I to <8 x i32>
4223  %1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B)
4224  %2 = bitcast i8 %__U to <8 x i1>
4225  %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> zeroinitializer
4226  ret <8 x float> %3
4227}
4228
4229define <2 x i64> @test_mm_permutex2var_epi64(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) {
4230; CHECK-LABEL: test_mm_permutex2var_epi64:
4231; CHECK:       # %bb.0: # %entry
4232; CHECK-NEXT:    vpermt2q %xmm2, %xmm1, %xmm0
4233; CHECK-NEXT:    ret{{[l|q]}}
4234entry:
4235  %0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B)
4236  ret <2 x i64> %0
4237}
4238
4239define <2 x i64> @test_mm_mask_permutex2var_epi64(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__I, <2 x i64> %__B) {
4240; X86-LABEL: test_mm_mask_permutex2var_epi64:
4241; X86:       # %bb.0: # %entry
4242; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4243; X86-NEXT:    kmovw %eax, %k1
4244; X86-NEXT:    vpermt2q %xmm2, %xmm1, %xmm0 {%k1}
4245; X86-NEXT:    retl
4246;
4247; X64-LABEL: test_mm_mask_permutex2var_epi64:
4248; X64:       # %bb.0: # %entry
4249; X64-NEXT:    kmovw %edi, %k1
4250; X64-NEXT:    vpermt2q %xmm2, %xmm1, %xmm0 {%k1}
4251; X64-NEXT:    retq
4252entry:
4253  %0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B)
4254  %1 = bitcast i8 %__U to <8 x i1>
4255  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
4256  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__A
4257  ret <2 x i64> %2
4258}
4259
4260define <2 x i64> @test_mm_maskz_permutex2var_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) {
4261; X86-LABEL: test_mm_maskz_permutex2var_epi64:
4262; X86:       # %bb.0: # %entry
4263; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4264; X86-NEXT:    kmovw %eax, %k1
4265; X86-NEXT:    vpermt2q %xmm2, %xmm1, %xmm0 {%k1} {z}
4266; X86-NEXT:    retl
4267;
4268; X64-LABEL: test_mm_maskz_permutex2var_epi64:
4269; X64:       # %bb.0: # %entry
4270; X64-NEXT:    kmovw %edi, %k1
4271; X64-NEXT:    vpermt2q %xmm2, %xmm1, %xmm0 {%k1} {z}
4272; X64-NEXT:    retq
4273entry:
4274  %0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B)
4275  %1 = bitcast i8 %__U to <8 x i1>
4276  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
4277  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
4278  ret <2 x i64> %2
4279}
4280
4281define <4 x i64> @test_mm256_permutex2var_epi64(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) {
4282; CHECK-LABEL: test_mm256_permutex2var_epi64:
4283; CHECK:       # %bb.0: # %entry
4284; CHECK-NEXT:    vpermt2q %ymm2, %ymm1, %ymm0
4285; CHECK-NEXT:    ret{{[l|q]}}
4286entry:
4287  %0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B)
4288  ret <4 x i64> %0
4289}
4290
4291define <4 x i64> @test_mm256_mask_permutex2var_epi64(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__I, <4 x i64> %__B) {
4292; X86-LABEL: test_mm256_mask_permutex2var_epi64:
4293; X86:       # %bb.0: # %entry
4294; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4295; X86-NEXT:    kmovw %eax, %k1
4296; X86-NEXT:    vpermt2q %ymm2, %ymm1, %ymm0 {%k1}
4297; X86-NEXT:    retl
4298;
4299; X64-LABEL: test_mm256_mask_permutex2var_epi64:
4300; X64:       # %bb.0: # %entry
4301; X64-NEXT:    kmovw %edi, %k1
4302; X64-NEXT:    vpermt2q %ymm2, %ymm1, %ymm0 {%k1}
4303; X64-NEXT:    retq
4304entry:
4305  %0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B)
4306  %1 = bitcast i8 %__U to <8 x i1>
4307  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4308  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__A
4309  ret <4 x i64> %2
4310}
4311
4312define <4 x i64> @test_mm256_maskz_permutex2var_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) {
4313; X86-LABEL: test_mm256_maskz_permutex2var_epi64:
4314; X86:       # %bb.0: # %entry
4315; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4316; X86-NEXT:    kmovw %eax, %k1
4317; X86-NEXT:    vpermt2q %ymm2, %ymm1, %ymm0 {%k1} {z}
4318; X86-NEXT:    retl
4319;
4320; X64-LABEL: test_mm256_maskz_permutex2var_epi64:
4321; X64:       # %bb.0: # %entry
4322; X64-NEXT:    kmovw %edi, %k1
4323; X64-NEXT:    vpermt2q %ymm2, %ymm1, %ymm0 {%k1} {z}
4324; X64-NEXT:    retq
4325entry:
4326  %0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B)
4327  %1 = bitcast i8 %__U to <8 x i1>
4328  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4329  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
4330  ret <4 x i64> %2
4331}
4332
4333
4334define <2 x double> @test_mm_mask_fmadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
4335; X86-LABEL: test_mm_mask_fmadd_pd:
4336; X86:       # %bb.0: # %entry
4337; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4338; X86-NEXT:    kmovw %eax, %k1
4339; X86-NEXT:    vfmadd132pd {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) + xmm2
4340; X86-NEXT:    retl
4341;
4342; X64-LABEL: test_mm_mask_fmadd_pd:
4343; X64:       # %bb.0: # %entry
4344; X64-NEXT:    kmovw %edi, %k1
4345; X64-NEXT:    vfmadd132pd {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) + xmm2
4346; X64-NEXT:    retq
4347entry:
4348  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
4349  %1 = bitcast i8 %__U to <8 x i1>
4350  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
4351  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
4352  ret <2 x double> %2
4353}
4354
4355define <2 x double> @test_mm_mask_fmsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
4356; X86-LABEL: test_mm_mask_fmsub_pd:
4357; X86:       # %bb.0: # %entry
4358; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4359; X86-NEXT:    kmovw %eax, %k1
4360; X86-NEXT:    vfmsub132pd {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) - xmm2
4361; X86-NEXT:    retl
4362;
4363; X64-LABEL: test_mm_mask_fmsub_pd:
4364; X64:       # %bb.0: # %entry
4365; X64-NEXT:    kmovw %edi, %k1
4366; X64-NEXT:    vfmsub132pd {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) - xmm2
4367; X64-NEXT:    retq
4368entry:
4369  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
4370  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
4371  %1 = bitcast i8 %__U to <8 x i1>
4372  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
4373  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
4374  ret <2 x double> %2
4375}
4376
4377define <2 x double> @test_mm_mask3_fmadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
4378; X86-LABEL: test_mm_mask3_fmadd_pd:
4379; X86:       # %bb.0: # %entry
4380; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4381; X86-NEXT:    kmovw %eax, %k1
4382; X86-NEXT:    vfmadd231pd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2
4383; X86-NEXT:    vmovapd %xmm2, %xmm0
4384; X86-NEXT:    retl
4385;
4386; X64-LABEL: test_mm_mask3_fmadd_pd:
4387; X64:       # %bb.0: # %entry
4388; X64-NEXT:    kmovw %edi, %k1
4389; X64-NEXT:    vfmadd231pd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2
4390; X64-NEXT:    vmovapd %xmm2, %xmm0
4391; X64-NEXT:    retq
4392entry:
4393  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
4394  %1 = bitcast i8 %__U to <8 x i1>
4395  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
4396  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
4397  ret <2 x double> %2
4398}
4399
4400define <2 x double> @test_mm_mask3_fnmadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
4401; X86-LABEL: test_mm_mask3_fnmadd_pd:
4402; X86:       # %bb.0: # %entry
4403; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4404; X86-NEXT:    kmovw %eax, %k1
4405; X86-NEXT:    vfnmadd231pd {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
4406; X86-NEXT:    vmovapd %xmm2, %xmm0
4407; X86-NEXT:    retl
4408;
4409; X64-LABEL: test_mm_mask3_fnmadd_pd:
4410; X64:       # %bb.0: # %entry
4411; X64-NEXT:    kmovw %edi, %k1
4412; X64-NEXT:    vfnmadd231pd {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
4413; X64-NEXT:    vmovapd %xmm2, %xmm0
4414; X64-NEXT:    retq
4415entry:
4416  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A
4417  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %__C) #9
4418  %1 = bitcast i8 %__U to <8 x i1>
4419  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
4420  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
4421  ret <2 x double> %2
4422}
4423
4424define <2 x double> @test_mm_maskz_fmadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
4425; X86-LABEL: test_mm_maskz_fmadd_pd:
4426; X86:       # %bb.0: # %entry
4427; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4428; X86-NEXT:    kmovw %eax, %k1
4429; X86-NEXT:    vfmadd213pd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
4430; X86-NEXT:    retl
4431;
4432; X64-LABEL: test_mm_maskz_fmadd_pd:
4433; X64:       # %bb.0: # %entry
4434; X64-NEXT:    kmovw %edi, %k1
4435; X64-NEXT:    vfmadd213pd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
4436; X64-NEXT:    retq
4437entry:
4438  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
4439  %1 = bitcast i8 %__U to <8 x i1>
4440  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
4441  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
4442  ret <2 x double> %2
4443}
4444
4445define <2 x double> @test_mm_maskz_fmsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
4446; X86-LABEL: test_mm_maskz_fmsub_pd:
4447; X86:       # %bb.0: # %entry
4448; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4449; X86-NEXT:    kmovw %eax, %k1
4450; X86-NEXT:    vfmsub213pd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
4451; X86-NEXT:    retl
4452;
4453; X64-LABEL: test_mm_maskz_fmsub_pd:
4454; X64:       # %bb.0: # %entry
4455; X64-NEXT:    kmovw %edi, %k1
4456; X64-NEXT:    vfmsub213pd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
4457; X64-NEXT:    retq
4458entry:
4459  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
4460  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
4461  %1 = bitcast i8 %__U to <8 x i1>
4462  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
4463  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
4464  ret <2 x double> %2
4465}
4466
4467define <2 x double> @test_mm_maskz_fnmadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
4468; X86-LABEL: test_mm_maskz_fnmadd_pd:
4469; X86:       # %bb.0: # %entry
4470; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4471; X86-NEXT:    kmovw %eax, %k1
4472; X86-NEXT:    vfnmadd213pd {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
4473; X86-NEXT:    retl
4474;
4475; X64-LABEL: test_mm_maskz_fnmadd_pd:
4476; X64:       # %bb.0: # %entry
4477; X64-NEXT:    kmovw %edi, %k1
4478; X64-NEXT:    vfnmadd213pd {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
4479; X64-NEXT:    retq
4480entry:
4481  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A
4482  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %__C) #9
4483  %1 = bitcast i8 %__U to <8 x i1>
4484  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
4485  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
4486  ret <2 x double> %2
4487}
4488
4489define <2 x double> @test_mm_maskz_fnmsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
4490; X86-LABEL: test_mm_maskz_fnmsub_pd:
4491; X86:       # %bb.0: # %entry
4492; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4493; X86-NEXT:    kmovw %eax, %k1
4494; X86-NEXT:    vfnmsub213pd {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
4495; X86-NEXT:    retl
4496;
4497; X64-LABEL: test_mm_maskz_fnmsub_pd:
4498; X64:       # %bb.0: # %entry
4499; X64-NEXT:    kmovw %edi, %k1
4500; X64-NEXT:    vfnmsub213pd {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
4501; X64-NEXT:    retq
4502entry:
4503  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A
4504  %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
4505  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %sub1.i) #9
4506  %1 = bitcast i8 %__U to <8 x i1>
4507  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
4508  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
4509  ret <2 x double> %2
4510}
4511
4512define <4 x double> @test_mm256_mask_fmadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
4513; X86-LABEL: test_mm256_mask_fmadd_pd:
4514; X86:       # %bb.0: # %entry
4515; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4516; X86-NEXT:    kmovw %eax, %k1
4517; X86-NEXT:    vfmadd132pd {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) + ymm2
4518; X86-NEXT:    retl
4519;
4520; X64-LABEL: test_mm256_mask_fmadd_pd:
4521; X64:       # %bb.0: # %entry
4522; X64-NEXT:    kmovw %edi, %k1
4523; X64-NEXT:    vfmadd132pd {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) + ymm2
4524; X64-NEXT:    retq
4525entry:
4526  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
4527  %1 = bitcast i8 %__U to <8 x i1>
4528  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4529  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
4530  ret <4 x double> %2
4531}
4532
4533define <4 x double> @test_mm256_mask_fmsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
4534; X86-LABEL: test_mm256_mask_fmsub_pd:
4535; X86:       # %bb.0: # %entry
4536; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4537; X86-NEXT:    kmovw %eax, %k1
4538; X86-NEXT:    vfmsub132pd {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) - ymm2
4539; X86-NEXT:    retl
4540;
4541; X64-LABEL: test_mm256_mask_fmsub_pd:
4542; X64:       # %bb.0: # %entry
4543; X64-NEXT:    kmovw %edi, %k1
4544; X64-NEXT:    vfmsub132pd {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) - ymm2
4545; X64-NEXT:    retq
4546entry:
4547  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4548  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
4549  %1 = bitcast i8 %__U to <8 x i1>
4550  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4551  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
4552  ret <4 x double> %2
4553}
4554
4555define <4 x double> @test_mm256_mask3_fmadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
4556; X86-LABEL: test_mm256_mask3_fmadd_pd:
4557; X86:       # %bb.0: # %entry
4558; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4559; X86-NEXT:    kmovw %eax, %k1
4560; X86-NEXT:    vfmadd231pd {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) + ymm2
4561; X86-NEXT:    vmovapd %ymm2, %ymm0
4562; X86-NEXT:    retl
4563;
4564; X64-LABEL: test_mm256_mask3_fmadd_pd:
4565; X64:       # %bb.0: # %entry
4566; X64-NEXT:    kmovw %edi, %k1
4567; X64-NEXT:    vfmadd231pd {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) + ymm2
4568; X64-NEXT:    vmovapd %ymm2, %ymm0
4569; X64-NEXT:    retq
4570entry:
4571  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
4572  %1 = bitcast i8 %__U to <8 x i1>
4573  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4574  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
4575  ret <4 x double> %2
4576}
4577
4578define <4 x double> @test_mm256_mask3_fnmadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
4579; X86-LABEL: test_mm256_mask3_fnmadd_pd:
4580; X86:       # %bb.0: # %entry
4581; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4582; X86-NEXT:    kmovw %eax, %k1
4583; X86-NEXT:    vfnmadd231pd {{.*#+}} ymm2 {%k1} = -(ymm0 * ymm1) + ymm2
4584; X86-NEXT:    vmovapd %ymm2, %ymm0
4585; X86-NEXT:    retl
4586;
4587; X64-LABEL: test_mm256_mask3_fnmadd_pd:
4588; X64:       # %bb.0: # %entry
4589; X64-NEXT:    kmovw %edi, %k1
4590; X64-NEXT:    vfnmadd231pd {{.*#+}} ymm2 {%k1} = -(ymm0 * ymm1) + ymm2
4591; X64-NEXT:    vmovapd %ymm2, %ymm0
4592; X64-NEXT:    retq
4593entry:
4594  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
4595  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %__C) #9
4596  %1 = bitcast i8 %__U to <8 x i1>
4597  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4598  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
4599  ret <4 x double> %2
4600}
4601
4602define <4 x double> @test_mm256_maskz_fmadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
4603; X86-LABEL: test_mm256_maskz_fmadd_pd:
4604; X86:       # %bb.0: # %entry
4605; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4606; X86-NEXT:    kmovw %eax, %k1
4607; X86-NEXT:    vfmadd213pd {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) + ymm2
4608; X86-NEXT:    retl
4609;
4610; X64-LABEL: test_mm256_maskz_fmadd_pd:
4611; X64:       # %bb.0: # %entry
4612; X64-NEXT:    kmovw %edi, %k1
4613; X64-NEXT:    vfmadd213pd {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) + ymm2
4614; X64-NEXT:    retq
4615entry:
4616  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
4617  %1 = bitcast i8 %__U to <8 x i1>
4618  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4619  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
4620  ret <4 x double> %2
4621}
4622
4623define <4 x double> @test_mm256_maskz_fmsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
4624; X86-LABEL: test_mm256_maskz_fmsub_pd:
4625; X86:       # %bb.0: # %entry
4626; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4627; X86-NEXT:    kmovw %eax, %k1
4628; X86-NEXT:    vfmsub213pd {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) - ymm2
4629; X86-NEXT:    retl
4630;
4631; X64-LABEL: test_mm256_maskz_fmsub_pd:
4632; X64:       # %bb.0: # %entry
4633; X64-NEXT:    kmovw %edi, %k1
4634; X64-NEXT:    vfmsub213pd {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) - ymm2
4635; X64-NEXT:    retq
4636entry:
4637  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4638  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
4639  %1 = bitcast i8 %__U to <8 x i1>
4640  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4641  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
4642  ret <4 x double> %2
4643}
4644
4645define <4 x double> @test_mm256_maskz_fnmadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
4646; X86-LABEL: test_mm256_maskz_fnmadd_pd:
4647; X86:       # %bb.0: # %entry
4648; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4649; X86-NEXT:    kmovw %eax, %k1
4650; X86-NEXT:    vfnmadd213pd {{.*#+}} ymm0 {%k1} {z} = -(ymm1 * ymm0) + ymm2
4651; X86-NEXT:    retl
4652;
4653; X64-LABEL: test_mm256_maskz_fnmadd_pd:
4654; X64:       # %bb.0: # %entry
4655; X64-NEXT:    kmovw %edi, %k1
4656; X64-NEXT:    vfnmadd213pd {{.*#+}} ymm0 {%k1} {z} = -(ymm1 * ymm0) + ymm2
4657; X64-NEXT:    retq
4658entry:
4659  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
4660  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %__C) #9
4661  %1 = bitcast i8 %__U to <8 x i1>
4662  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4663  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
4664  ret <4 x double> %2
4665}
4666
4667define <4 x double> @test_mm256_maskz_fnmsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
4668; X86-LABEL: test_mm256_maskz_fnmsub_pd:
4669; X86:       # %bb.0: # %entry
4670; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4671; X86-NEXT:    kmovw %eax, %k1
4672; X86-NEXT:    vfnmsub213pd {{.*#+}} ymm0 {%k1} {z} = -(ymm1 * ymm0) - ymm2
4673; X86-NEXT:    retl
4674;
4675; X64-LABEL: test_mm256_maskz_fnmsub_pd:
4676; X64:       # %bb.0: # %entry
4677; X64-NEXT:    kmovw %edi, %k1
4678; X64-NEXT:    vfnmsub213pd {{.*#+}} ymm0 {%k1} {z} = -(ymm1 * ymm0) - ymm2
4679; X64-NEXT:    retq
4680entry:
4681  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
4682  %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4683  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %sub1.i) #9
4684  %1 = bitcast i8 %__U to <8 x i1>
4685  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4686  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
4687  ret <4 x double> %2
4688}
4689
4690define <4 x float> @test_mm_mask_fmadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
4691; X86-LABEL: test_mm_mask_fmadd_ps:
4692; X86:       # %bb.0: # %entry
4693; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4694; X86-NEXT:    kmovw %eax, %k1
4695; X86-NEXT:    vfmadd132ps {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) + xmm2
4696; X86-NEXT:    retl
4697;
4698; X64-LABEL: test_mm_mask_fmadd_ps:
4699; X64:       # %bb.0: # %entry
4700; X64-NEXT:    kmovw %edi, %k1
4701; X64-NEXT:    vfmadd132ps {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) + xmm2
4702; X64-NEXT:    retq
4703entry:
4704  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
4705  %1 = bitcast i8 %__U to <8 x i1>
4706  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4707  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
4708  ret <4 x float> %2
4709}
4710
4711define <4 x float> @test_mm_mask_fmsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
4712; X86-LABEL: test_mm_mask_fmsub_ps:
4713; X86:       # %bb.0: # %entry
4714; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4715; X86-NEXT:    kmovw %eax, %k1
4716; X86-NEXT:    vfmsub132ps {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) - xmm2
4717; X86-NEXT:    retl
4718;
4719; X64-LABEL: test_mm_mask_fmsub_ps:
4720; X64:       # %bb.0: # %entry
4721; X64-NEXT:    kmovw %edi, %k1
4722; X64-NEXT:    vfmsub132ps {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) - xmm2
4723; X64-NEXT:    retq
4724entry:
4725  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4726  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
4727  %1 = bitcast i8 %__U to <8 x i1>
4728  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4729  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
4730  ret <4 x float> %2
4731}
4732
4733define <4 x float> @test_mm_mask3_fmadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
4734; X86-LABEL: test_mm_mask3_fmadd_ps:
4735; X86:       # %bb.0: # %entry
4736; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4737; X86-NEXT:    kmovw %eax, %k1
4738; X86-NEXT:    vfmadd231ps {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2
4739; X86-NEXT:    vmovaps %xmm2, %xmm0
4740; X86-NEXT:    retl
4741;
4742; X64-LABEL: test_mm_mask3_fmadd_ps:
4743; X64:       # %bb.0: # %entry
4744; X64-NEXT:    kmovw %edi, %k1
4745; X64-NEXT:    vfmadd231ps {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2
4746; X64-NEXT:    vmovaps %xmm2, %xmm0
4747; X64-NEXT:    retq
4748entry:
4749  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
4750  %1 = bitcast i8 %__U to <8 x i1>
4751  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4752  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
4753  ret <4 x float> %2
4754}
4755
4756define <4 x float> @test_mm_mask3_fnmadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
4757; X86-LABEL: test_mm_mask3_fnmadd_ps:
4758; X86:       # %bb.0: # %entry
4759; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4760; X86-NEXT:    kmovw %eax, %k1
4761; X86-NEXT:    vfnmadd231ps {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
4762; X86-NEXT:    vmovaps %xmm2, %xmm0
4763; X86-NEXT:    retl
4764;
4765; X64-LABEL: test_mm_mask3_fnmadd_ps:
4766; X64:       # %bb.0: # %entry
4767; X64-NEXT:    kmovw %edi, %k1
4768; X64-NEXT:    vfnmadd231ps {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
4769; X64-NEXT:    vmovaps %xmm2, %xmm0
4770; X64-NEXT:    retq
4771entry:
4772  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
4773  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %__C) #9
4774  %1 = bitcast i8 %__U to <8 x i1>
4775  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4776  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
4777  ret <4 x float> %2
4778}
4779
4780define <4 x float> @test_mm_maskz_fmadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
4781; X86-LABEL: test_mm_maskz_fmadd_ps:
4782; X86:       # %bb.0: # %entry
4783; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4784; X86-NEXT:    kmovw %eax, %k1
4785; X86-NEXT:    vfmadd213ps {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
4786; X86-NEXT:    retl
4787;
4788; X64-LABEL: test_mm_maskz_fmadd_ps:
4789; X64:       # %bb.0: # %entry
4790; X64-NEXT:    kmovw %edi, %k1
4791; X64-NEXT:    vfmadd213ps {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
4792; X64-NEXT:    retq
4793entry:
4794  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
4795  %1 = bitcast i8 %__U to <8 x i1>
4796  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4797  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
4798  ret <4 x float> %2
4799}
4800
4801define <4 x float> @test_mm_maskz_fmsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
4802; X86-LABEL: test_mm_maskz_fmsub_ps:
4803; X86:       # %bb.0: # %entry
4804; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4805; X86-NEXT:    kmovw %eax, %k1
4806; X86-NEXT:    vfmsub213ps {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
4807; X86-NEXT:    retl
4808;
4809; X64-LABEL: test_mm_maskz_fmsub_ps:
4810; X64:       # %bb.0: # %entry
4811; X64-NEXT:    kmovw %edi, %k1
4812; X64-NEXT:    vfmsub213ps {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
4813; X64-NEXT:    retq
4814entry:
4815  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4816  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
4817  %1 = bitcast i8 %__U to <8 x i1>
4818  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4819  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
4820  ret <4 x float> %2
4821}
4822
4823define <4 x float> @test_mm_maskz_fnmadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
4824; X86-LABEL: test_mm_maskz_fnmadd_ps:
4825; X86:       # %bb.0: # %entry
4826; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4827; X86-NEXT:    kmovw %eax, %k1
4828; X86-NEXT:    vfnmadd213ps {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
4829; X86-NEXT:    retl
4830;
4831; X64-LABEL: test_mm_maskz_fnmadd_ps:
4832; X64:       # %bb.0: # %entry
4833; X64-NEXT:    kmovw %edi, %k1
4834; X64-NEXT:    vfnmadd213ps {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
4835; X64-NEXT:    retq
4836entry:
4837  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
4838  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %__C) #9
4839  %1 = bitcast i8 %__U to <8 x i1>
4840  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4841  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
4842  ret <4 x float> %2
4843}
4844
4845define <4 x float> @test_mm_maskz_fnmsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
4846; X86-LABEL: test_mm_maskz_fnmsub_ps:
4847; X86:       # %bb.0: # %entry
4848; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4849; X86-NEXT:    kmovw %eax, %k1
4850; X86-NEXT:    vfnmsub213ps {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
4851; X86-NEXT:    retl
4852;
4853; X64-LABEL: test_mm_maskz_fnmsub_ps:
4854; X64:       # %bb.0: # %entry
4855; X64-NEXT:    kmovw %edi, %k1
4856; X64-NEXT:    vfnmsub213ps {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
4857; X64-NEXT:    retq
4858entry:
4859  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
4860  %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4861  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %sub1.i) #9
4862  %1 = bitcast i8 %__U to <8 x i1>
4863  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4864  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
4865  ret <4 x float> %2
4866}
4867
4868define <8 x float> @test_mm256_mask_fmadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
4869; X86-LABEL: test_mm256_mask_fmadd_ps:
4870; X86:       # %bb.0: # %entry
4871; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4872; X86-NEXT:    kmovw %eax, %k1
4873; X86-NEXT:    vfmadd132ps {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) + ymm2
4874; X86-NEXT:    retl
4875;
4876; X64-LABEL: test_mm256_mask_fmadd_ps:
4877; X64:       # %bb.0: # %entry
4878; X64-NEXT:    kmovw %edi, %k1
4879; X64-NEXT:    vfmadd132ps {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) + ymm2
4880; X64-NEXT:    retq
4881entry:
4882  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
4883  %1 = bitcast i8 %__U to <8 x i1>
4884  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
4885  ret <8 x float> %2
4886}
4887
4888define <8 x float> @test_mm256_mask_fmsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
4889; X86-LABEL: test_mm256_mask_fmsub_ps:
4890; X86:       # %bb.0: # %entry
4891; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4892; X86-NEXT:    kmovw %eax, %k1
4893; X86-NEXT:    vfmsub132ps {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) - ymm2
4894; X86-NEXT:    retl
4895;
4896; X64-LABEL: test_mm256_mask_fmsub_ps:
4897; X64:       # %bb.0: # %entry
4898; X64-NEXT:    kmovw %edi, %k1
4899; X64-NEXT:    vfmsub132ps {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) - ymm2
4900; X64-NEXT:    retq
4901entry:
4902  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4903  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
4904  %1 = bitcast i8 %__U to <8 x i1>
4905  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
4906  ret <8 x float> %2
4907}
4908
4909define <8 x float> @test_mm256_mask3_fmadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
4910; X86-LABEL: test_mm256_mask3_fmadd_ps:
4911; X86:       # %bb.0: # %entry
4912; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4913; X86-NEXT:    kmovw %eax, %k1
4914; X86-NEXT:    vfmadd231ps {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) + ymm2
4915; X86-NEXT:    vmovaps %ymm2, %ymm0
4916; X86-NEXT:    retl
4917;
4918; X64-LABEL: test_mm256_mask3_fmadd_ps:
4919; X64:       # %bb.0: # %entry
4920; X64-NEXT:    kmovw %edi, %k1
4921; X64-NEXT:    vfmadd231ps {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) + ymm2
4922; X64-NEXT:    vmovaps %ymm2, %ymm0
4923; X64-NEXT:    retq
4924entry:
4925  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
4926  %1 = bitcast i8 %__U to <8 x i1>
4927  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
4928  ret <8 x float> %2
4929}
4930
4931define <8 x float> @test_mm256_mask3_fnmadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
4932; X86-LABEL: test_mm256_mask3_fnmadd_ps:
4933; X86:       # %bb.0: # %entry
4934; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4935; X86-NEXT:    kmovw %eax, %k1
4936; X86-NEXT:    vfnmadd231ps {{.*#+}} ymm2 {%k1} = -(ymm0 * ymm1) + ymm2
4937; X86-NEXT:    vmovaps %ymm2, %ymm0
4938; X86-NEXT:    retl
4939;
4940; X64-LABEL: test_mm256_mask3_fnmadd_ps:
4941; X64:       # %bb.0: # %entry
4942; X64-NEXT:    kmovw %edi, %k1
4943; X64-NEXT:    vfnmadd231ps {{.*#+}} ymm2 {%k1} = -(ymm0 * ymm1) + ymm2
4944; X64-NEXT:    vmovaps %ymm2, %ymm0
4945; X64-NEXT:    retq
4946entry:
4947  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
4948  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %__C) #9
4949  %1 = bitcast i8 %__U to <8 x i1>
4950  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
4951  ret <8 x float> %2
4952}
4953
4954define <8 x float> @test_mm256_maskz_fmadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
4955; X86-LABEL: test_mm256_maskz_fmadd_ps:
4956; X86:       # %bb.0: # %entry
4957; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4958; X86-NEXT:    kmovw %eax, %k1
4959; X86-NEXT:    vfmadd213ps {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) + ymm2
4960; X86-NEXT:    retl
4961;
4962; X64-LABEL: test_mm256_maskz_fmadd_ps:
4963; X64:       # %bb.0: # %entry
4964; X64-NEXT:    kmovw %edi, %k1
4965; X64-NEXT:    vfmadd213ps {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) + ymm2
4966; X64-NEXT:    retq
4967entry:
4968  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
4969  %1 = bitcast i8 %__U to <8 x i1>
4970  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
4971  ret <8 x float> %2
4972}
4973
4974define <8 x float> @test_mm256_maskz_fmsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
4975; X86-LABEL: test_mm256_maskz_fmsub_ps:
4976; X86:       # %bb.0: # %entry
4977; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4978; X86-NEXT:    kmovw %eax, %k1
4979; X86-NEXT:    vfmsub213ps {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) - ymm2
4980; X86-NEXT:    retl
4981;
4982; X64-LABEL: test_mm256_maskz_fmsub_ps:
4983; X64:       # %bb.0: # %entry
4984; X64-NEXT:    kmovw %edi, %k1
4985; X64-NEXT:    vfmsub213ps {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) - ymm2
4986; X64-NEXT:    retq
4987entry:
4988  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4989  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
4990  %1 = bitcast i8 %__U to <8 x i1>
4991  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
4992  ret <8 x float> %2
4993}
4994
4995define <8 x float> @test_mm256_maskz_fnmadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
4996; X86-LABEL: test_mm256_maskz_fnmadd_ps:
4997; X86:       # %bb.0: # %entry
4998; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4999; X86-NEXT:    kmovw %eax, %k1
5000; X86-NEXT:    vfnmadd213ps {{.*#+}} ymm0 {%k1} {z} = -(ymm1 * ymm0) + ymm2
5001; X86-NEXT:    retl
5002;
5003; X64-LABEL: test_mm256_maskz_fnmadd_ps:
5004; X64:       # %bb.0: # %entry
5005; X64-NEXT:    kmovw %edi, %k1
5006; X64-NEXT:    vfnmadd213ps {{.*#+}} ymm0 {%k1} {z} = -(ymm1 * ymm0) + ymm2
5007; X64-NEXT:    retq
5008entry:
5009  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
5010  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %__C) #9
5011  %1 = bitcast i8 %__U to <8 x i1>
5012  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
5013  ret <8 x float> %2
5014}
5015
5016define <8 x float> @test_mm256_maskz_fnmsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
5017; X86-LABEL: test_mm256_maskz_fnmsub_ps:
5018; X86:       # %bb.0: # %entry
5019; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5020; X86-NEXT:    kmovw %eax, %k1
5021; X86-NEXT:    vfnmsub213ps {{.*#+}} ymm0 {%k1} {z} = -(ymm1 * ymm0) - ymm2
5022; X86-NEXT:    retl
5023;
5024; X64-LABEL: test_mm256_maskz_fnmsub_ps:
5025; X64:       # %bb.0: # %entry
5026; X64-NEXT:    kmovw %edi, %k1
5027; X64-NEXT:    vfnmsub213ps {{.*#+}} ymm0 {%k1} {z} = -(ymm1 * ymm0) - ymm2
5028; X64-NEXT:    retq
5029entry:
5030  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
5031  %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5032  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %sub1.i) #9
5033  %1 = bitcast i8 %__U to <8 x i1>
5034  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
5035  ret <8 x float> %2
5036}
5037
5038define <2 x double> @test_mm_mask_fmaddsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
5039; X86-LABEL: test_mm_mask_fmaddsub_pd:
5040; X86:       # %bb.0: # %entry
5041; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5042; X86-NEXT:    kmovw %eax, %k1
5043; X86-NEXT:    vfmaddsub132pd {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2
5044; X86-NEXT:    retl
5045;
5046; X64-LABEL: test_mm_mask_fmaddsub_pd:
5047; X64:       # %bb.0: # %entry
5048; X64-NEXT:    kmovw %edi, %k1
5049; X64-NEXT:    vfmaddsub132pd {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2
5050; X64-NEXT:    retq
5051entry:
5052  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
5053  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
5054  %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9
5055  %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3>
5056  %4 = bitcast i8 %__U to <8 x i1>
5057  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
5058  %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> %__A
5059  ret <2 x double> %5
5060}
5061
5062define <2 x double> @test_mm_mask_fmsubadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
5063; X86-LABEL: test_mm_mask_fmsubadd_pd:
5064; X86:       # %bb.0: # %entry
5065; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5066; X86-NEXT:    kmovw %eax, %k1
5067; X86-NEXT:    vfmsubadd132pd {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) -/+ xmm2
5068; X86-NEXT:    retl
5069;
5070; X64-LABEL: test_mm_mask_fmsubadd_pd:
5071; X64:       # %bb.0: # %entry
5072; X64-NEXT:    kmovw %edi, %k1
5073; X64-NEXT:    vfmsubadd132pd {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) -/+ xmm2
5074; X64-NEXT:    retq
5075entry:
5076  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
5077  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
5078  %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
5079  %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3>
5080  %3 = bitcast i8 %__U to <8 x i1>
5081  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
5082  %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> %__A
5083  ret <2 x double> %4
5084}
5085
5086define <2 x double> @test_mm_mask3_fmaddsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
5087; X86-LABEL: test_mm_mask3_fmaddsub_pd:
5088; X86:       # %bb.0: # %entry
5089; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5090; X86-NEXT:    kmovw %eax, %k1
5091; X86-NEXT:    vfmaddsub231pd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) +/- xmm2
5092; X86-NEXT:    vmovapd %xmm2, %xmm0
5093; X86-NEXT:    retl
5094;
5095; X64-LABEL: test_mm_mask3_fmaddsub_pd:
5096; X64:       # %bb.0: # %entry
5097; X64-NEXT:    kmovw %edi, %k1
5098; X64-NEXT:    vfmaddsub231pd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) +/- xmm2
5099; X64-NEXT:    vmovapd %xmm2, %xmm0
5100; X64-NEXT:    retq
5101entry:
5102  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
5103  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
5104  %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9
5105  %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3>
5106  %4 = bitcast i8 %__U to <8 x i1>
5107  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
5108  %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> %__C
5109  ret <2 x double> %5
5110}
5111
5112define <2 x double> @test_mm_maskz_fmaddsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
5113; X86-LABEL: test_mm_maskz_fmaddsub_pd:
5114; X86:       # %bb.0: # %entry
5115; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5116; X86-NEXT:    kmovw %eax, %k1
5117; X86-NEXT:    vfmaddsub213pd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) +/- xmm2
5118; X86-NEXT:    retl
5119;
5120; X64-LABEL: test_mm_maskz_fmaddsub_pd:
5121; X64:       # %bb.0: # %entry
5122; X64-NEXT:    kmovw %edi, %k1
5123; X64-NEXT:    vfmaddsub213pd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) +/- xmm2
5124; X64-NEXT:    retq
5125entry:
5126  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
5127  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
5128  %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9
5129  %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3>
5130  %4 = bitcast i8 %__U to <8 x i1>
5131  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
5132  %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> zeroinitializer
5133  ret <2 x double> %5
5134}
5135
5136define <2 x double> @test_mm_maskz_fmsubadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
5137; X86-LABEL: test_mm_maskz_fmsubadd_pd:
5138; X86:       # %bb.0: # %entry
5139; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5140; X86-NEXT:    kmovw %eax, %k1
5141; X86-NEXT:    vfmsubadd213pd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) -/+ xmm2
5142; X86-NEXT:    retl
5143;
5144; X64-LABEL: test_mm_maskz_fmsubadd_pd:
5145; X64:       # %bb.0: # %entry
5146; X64-NEXT:    kmovw %edi, %k1
5147; X64-NEXT:    vfmsubadd213pd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) -/+ xmm2
5148; X64-NEXT:    retq
5149entry:
5150  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
5151  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
5152  %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
5153  %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3>
5154  %3 = bitcast i8 %__U to <8 x i1>
5155  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
5156  %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> zeroinitializer
5157  ret <2 x double> %4
5158}
5159
5160define <4 x double> @test_mm256_mask_fmaddsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
5161; X86-LABEL: test_mm256_mask_fmaddsub_pd:
5162; X86:       # %bb.0: # %entry
5163; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5164; X86-NEXT:    kmovw %eax, %k1
5165; X86-NEXT:    vfmaddsub132pd {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) +/- ymm2
5166; X86-NEXT:    retl
5167;
5168; X64-LABEL: test_mm256_mask_fmaddsub_pd:
5169; X64:       # %bb.0: # %entry
5170; X64-NEXT:    kmovw %edi, %k1
5171; X64-NEXT:    vfmaddsub132pd {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) +/- ymm2
5172; X64-NEXT:    retq
5173entry:
5174  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
5175  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
5176  %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9
5177  %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
5178  %4 = bitcast i8 %__U to <8 x i1>
5179  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5180  %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> %__A
5181  ret <4 x double> %5
5182}
5183
5184define <4 x double> @test_mm256_mask_fmsubadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
5185; X86-LABEL: test_mm256_mask_fmsubadd_pd:
5186; X86:       # %bb.0: # %entry
5187; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5188; X86-NEXT:    kmovw %eax, %k1
5189; X86-NEXT:    vfmsubadd132pd {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) -/+ ymm2
5190; X86-NEXT:    retl
5191;
5192; X64-LABEL: test_mm256_mask_fmsubadd_pd:
5193; X64:       # %bb.0: # %entry
5194; X64-NEXT:    kmovw %edi, %k1
5195; X64-NEXT:    vfmsubadd132pd {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) -/+ ymm2
5196; X64-NEXT:    retq
5197entry:
5198  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
5199  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
5200  %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
5201  %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
5202  %3 = bitcast i8 %__U to <8 x i1>
5203  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5204  %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> %__A
5205  ret <4 x double> %4
5206}
5207
5208define <4 x double> @test_mm256_mask3_fmaddsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
5209; X86-LABEL: test_mm256_mask3_fmaddsub_pd:
5210; X86:       # %bb.0: # %entry
5211; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5212; X86-NEXT:    kmovw %eax, %k1
5213; X86-NEXT:    vfmaddsub231pd {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) +/- ymm2
5214; X86-NEXT:    vmovapd %ymm2, %ymm0
5215; X86-NEXT:    retl
5216;
5217; X64-LABEL: test_mm256_mask3_fmaddsub_pd:
5218; X64:       # %bb.0: # %entry
5219; X64-NEXT:    kmovw %edi, %k1
5220; X64-NEXT:    vfmaddsub231pd {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) +/- ymm2
5221; X64-NEXT:    vmovapd %ymm2, %ymm0
5222; X64-NEXT:    retq
5223entry:
5224  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
5225  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
5226  %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9
5227  %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
5228  %4 = bitcast i8 %__U to <8 x i1>
5229  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5230  %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> %__C
5231  ret <4 x double> %5
5232}
5233
5234define <4 x double> @test_mm256_maskz_fmaddsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
5235; X86-LABEL: test_mm256_maskz_fmaddsub_pd:
5236; X86:       # %bb.0: # %entry
5237; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5238; X86-NEXT:    kmovw %eax, %k1
5239; X86-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) +/- ymm2
5240; X86-NEXT:    retl
5241;
5242; X64-LABEL: test_mm256_maskz_fmaddsub_pd:
5243; X64:       # %bb.0: # %entry
5244; X64-NEXT:    kmovw %edi, %k1
5245; X64-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) +/- ymm2
5246; X64-NEXT:    retq
5247entry:
5248  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
5249  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
5250  %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9
5251  %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
5252  %4 = bitcast i8 %__U to <8 x i1>
5253  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5254  %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> zeroinitializer
5255  ret <4 x double> %5
5256}
5257
5258define <4 x double> @test_mm256_maskz_fmsubadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
5259; X86-LABEL: test_mm256_maskz_fmsubadd_pd:
5260; X86:       # %bb.0: # %entry
5261; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5262; X86-NEXT:    kmovw %eax, %k1
5263; X86-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) -/+ ymm2
5264; X86-NEXT:    retl
5265;
5266; X64-LABEL: test_mm256_maskz_fmsubadd_pd:
5267; X64:       # %bb.0: # %entry
5268; X64-NEXT:    kmovw %edi, %k1
5269; X64-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) -/+ ymm2
5270; X64-NEXT:    retq
5271entry:
5272  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
5273  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
5274  %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
5275  %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
5276  %3 = bitcast i8 %__U to <8 x i1>
5277  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5278  %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> zeroinitializer
5279  ret <4 x double> %4
5280}
5281
5282define <4 x float> @test_mm_mask_fmaddsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
5283; X86-LABEL: test_mm_mask_fmaddsub_ps:
5284; X86:       # %bb.0: # %entry
5285; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5286; X86-NEXT:    kmovw %eax, %k1
5287; X86-NEXT:    vfmaddsub132ps {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2
5288; X86-NEXT:    retl
5289;
5290; X64-LABEL: test_mm_mask_fmaddsub_ps:
5291; X64:       # %bb.0: # %entry
5292; X64-NEXT:    kmovw %edi, %k1
5293; X64-NEXT:    vfmaddsub132ps {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2
5294; X64-NEXT:    retq
5295entry:
5296  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
5297  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5298  %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9
5299  %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
5300  %4 = bitcast i8 %__U to <8 x i1>
5301  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5302  %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> %__A
5303  ret <4 x float> %5
5304}
5305
5306define <4 x float> @test_mm_mask_fmsubadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
5307; X86-LABEL: test_mm_mask_fmsubadd_ps:
5308; X86:       # %bb.0: # %entry
5309; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5310; X86-NEXT:    kmovw %eax, %k1
5311; X86-NEXT:    vfmsubadd132ps {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) -/+ xmm2
5312; X86-NEXT:    retl
5313;
5314; X64-LABEL: test_mm_mask_fmsubadd_ps:
5315; X64:       # %bb.0: # %entry
5316; X64-NEXT:    kmovw %edi, %k1
5317; X64-NEXT:    vfmsubadd132ps {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) -/+ xmm2
5318; X64-NEXT:    retq
5319entry:
5320  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5321  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
5322  %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
5323  %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
5324  %3 = bitcast i8 %__U to <8 x i1>
5325  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5326  %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> %__A
5327  ret <4 x float> %4
5328}
5329
5330define <4 x float> @test_mm_mask3_fmaddsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
5331; X86-LABEL: test_mm_mask3_fmaddsub_ps:
5332; X86:       # %bb.0: # %entry
5333; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5334; X86-NEXT:    kmovw %eax, %k1
5335; X86-NEXT:    vfmaddsub231ps {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) +/- xmm2
5336; X86-NEXT:    vmovaps %xmm2, %xmm0
5337; X86-NEXT:    retl
5338;
5339; X64-LABEL: test_mm_mask3_fmaddsub_ps:
5340; X64:       # %bb.0: # %entry
5341; X64-NEXT:    kmovw %edi, %k1
5342; X64-NEXT:    vfmaddsub231ps {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) +/- xmm2
5343; X64-NEXT:    vmovaps %xmm2, %xmm0
5344; X64-NEXT:    retq
5345entry:
5346  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
5347  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5348  %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9
5349  %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
5350  %4 = bitcast i8 %__U to <8 x i1>
5351  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5352  %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> %__C
5353  ret <4 x float> %5
5354}
5355
5356define <4 x float> @test_mm_maskz_fmaddsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5357; X86-LABEL: test_mm_maskz_fmaddsub_ps:
5358; X86:       # %bb.0: # %entry
5359; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5360; X86-NEXT:    kmovw %eax, %k1
5361; X86-NEXT:    vfmaddsub213ps {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) +/- xmm2
5362; X86-NEXT:    retl
5363;
5364; X64-LABEL: test_mm_maskz_fmaddsub_ps:
5365; X64:       # %bb.0: # %entry
5366; X64-NEXT:    kmovw %edi, %k1
5367; X64-NEXT:    vfmaddsub213ps {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) +/- xmm2
5368; X64-NEXT:    retq
5369entry:
5370  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
5371  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5372  %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9
5373  %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
5374  %4 = bitcast i8 %__U to <8 x i1>
5375  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5376  %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> zeroinitializer
5377  ret <4 x float> %5
5378}
5379
5380define <4 x float> @test_mm_maskz_fmsubadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5381; X86-LABEL: test_mm_maskz_fmsubadd_ps:
5382; X86:       # %bb.0: # %entry
5383; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5384; X86-NEXT:    kmovw %eax, %k1
5385; X86-NEXT:    vfmsubadd213ps {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) -/+ xmm2
5386; X86-NEXT:    retl
5387;
5388; X64-LABEL: test_mm_maskz_fmsubadd_ps:
5389; X64:       # %bb.0: # %entry
5390; X64-NEXT:    kmovw %edi, %k1
5391; X64-NEXT:    vfmsubadd213ps {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) -/+ xmm2
5392; X64-NEXT:    retq
5393entry:
5394  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5395  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
5396  %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
5397  %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
5398  %3 = bitcast i8 %__U to <8 x i1>
5399  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5400  %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> zeroinitializer
5401  ret <4 x float> %4
5402}
5403
5404define <8 x float> @test_mm256_mask_fmaddsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
5405; X86-LABEL: test_mm256_mask_fmaddsub_ps:
5406; X86:       # %bb.0: # %entry
5407; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5408; X86-NEXT:    kmovw %eax, %k1
5409; X86-NEXT:    vfmaddsub132ps {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) +/- ymm2
5410; X86-NEXT:    retl
5411;
5412; X64-LABEL: test_mm256_mask_fmaddsub_ps:
5413; X64:       # %bb.0: # %entry
5414; X64-NEXT:    kmovw %edi, %k1
5415; X64-NEXT:    vfmaddsub132ps {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) +/- ymm2
5416; X64-NEXT:    retq
5417entry:
5418  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
5419  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5420  %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9
5421  %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
5422  %4 = bitcast i8 %__U to <8 x i1>
5423  %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %__A
5424  ret <8 x float> %5
5425}
5426
5427define <8 x float> @test_mm256_mask_fmsubadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
5428; X86-LABEL: test_mm256_mask_fmsubadd_ps:
5429; X86:       # %bb.0: # %entry
5430; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5431; X86-NEXT:    kmovw %eax, %k1
5432; X86-NEXT:    vfmsubadd132ps {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) -/+ ymm2
5433; X86-NEXT:    retl
5434;
5435; X64-LABEL: test_mm256_mask_fmsubadd_ps:
5436; X64:       # %bb.0: # %entry
5437; X64-NEXT:    kmovw %edi, %k1
5438; X64-NEXT:    vfmsubadd132ps {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) -/+ ymm2
5439; X64-NEXT:    retq
5440entry:
5441  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5442  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
5443  %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
5444  %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
5445  %3 = bitcast i8 %__U to <8 x i1>
5446  %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %__A
5447  ret <8 x float> %4
5448}
5449
5450define <8 x float> @test_mm256_mask3_fmaddsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
5451; X86-LABEL: test_mm256_mask3_fmaddsub_ps:
5452; X86:       # %bb.0: # %entry
5453; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5454; X86-NEXT:    kmovw %eax, %k1
5455; X86-NEXT:    vfmaddsub231ps {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) +/- ymm2
5456; X86-NEXT:    vmovaps %ymm2, %ymm0
5457; X86-NEXT:    retl
5458;
5459; X64-LABEL: test_mm256_mask3_fmaddsub_ps:
5460; X64:       # %bb.0: # %entry
5461; X64-NEXT:    kmovw %edi, %k1
5462; X64-NEXT:    vfmaddsub231ps {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) +/- ymm2
5463; X64-NEXT:    vmovaps %ymm2, %ymm0
5464; X64-NEXT:    retq
5465entry:
5466  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
5467  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5468  %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9
5469  %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
5470  %4 = bitcast i8 %__U to <8 x i1>
5471  %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %__C
5472  ret <8 x float> %5
5473}
5474
5475define <8 x float> @test_mm256_maskz_fmaddsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
5476; X86-LABEL: test_mm256_maskz_fmaddsub_ps:
5477; X86:       # %bb.0: # %entry
5478; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5479; X86-NEXT:    kmovw %eax, %k1
5480; X86-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) +/- ymm2
5481; X86-NEXT:    retl
5482;
5483; X64-LABEL: test_mm256_maskz_fmaddsub_ps:
5484; X64:       # %bb.0: # %entry
5485; X64-NEXT:    kmovw %edi, %k1
5486; X64-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) +/- ymm2
5487; X64-NEXT:    retq
5488entry:
5489  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
5490  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5491  %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9
5492  %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
5493  %4 = bitcast i8 %__U to <8 x i1>
5494  %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> zeroinitializer
5495  ret <8 x float> %5
5496}
5497
5498define <8 x float> @test_mm256_maskz_fmsubadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
5499; X86-LABEL: test_mm256_maskz_fmsubadd_ps:
5500; X86:       # %bb.0: # %entry
5501; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5502; X86-NEXT:    kmovw %eax, %k1
5503; X86-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) -/+ ymm2
5504; X86-NEXT:    retl
5505;
5506; X64-LABEL: test_mm256_maskz_fmsubadd_ps:
5507; X64:       # %bb.0: # %entry
5508; X64-NEXT:    kmovw %edi, %k1
5509; X64-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) -/+ ymm2
5510; X64-NEXT:    retq
5511entry:
5512  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5513  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
5514  %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
5515  %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
5516  %3 = bitcast i8 %__U to <8 x i1>
5517  %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> zeroinitializer
5518  ret <8 x float> %4
5519}
5520
5521define <2 x double> @test_mm_mask3_fmsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
5522; X86-LABEL: test_mm_mask3_fmsub_pd:
5523; X86:       # %bb.0: # %entry
5524; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5525; X86-NEXT:    kmovw %eax, %k1
5526; X86-NEXT:    vfmsub231pd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2
5527; X86-NEXT:    vmovapd %xmm2, %xmm0
5528; X86-NEXT:    retl
5529;
5530; X64-LABEL: test_mm_mask3_fmsub_pd:
5531; X64:       # %bb.0: # %entry
5532; X64-NEXT:    kmovw %edi, %k1
5533; X64-NEXT:    vfmsub231pd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2
5534; X64-NEXT:    vmovapd %xmm2, %xmm0
5535; X64-NEXT:    retq
5536entry:
5537  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
5538  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
5539  %1 = bitcast i8 %__U to <8 x i1>
5540  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
5541  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
5542  ret <2 x double> %2
5543}
5544
5545define <4 x double> @test_mm256_mask3_fmsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
5546; X86-LABEL: test_mm256_mask3_fmsub_pd:
5547; X86:       # %bb.0: # %entry
5548; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5549; X86-NEXT:    kmovw %eax, %k1
5550; X86-NEXT:    vfmsub231pd {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) - ymm2
5551; X86-NEXT:    vmovapd %ymm2, %ymm0
5552; X86-NEXT:    retl
5553;
5554; X64-LABEL: test_mm256_mask3_fmsub_pd:
5555; X64:       # %bb.0: # %entry
5556; X64-NEXT:    kmovw %edi, %k1
5557; X64-NEXT:    vfmsub231pd {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) - ymm2
5558; X64-NEXT:    vmovapd %ymm2, %ymm0
5559; X64-NEXT:    retq
5560entry:
5561  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
5562  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
5563  %1 = bitcast i8 %__U to <8 x i1>
5564  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5565  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
5566  ret <4 x double> %2
5567}
5568
5569define <4 x float> @test_mm_mask3_fmsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
5570; X86-LABEL: test_mm_mask3_fmsub_ps:
5571; X86:       # %bb.0: # %entry
5572; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5573; X86-NEXT:    kmovw %eax, %k1
5574; X86-NEXT:    vfmsub231ps {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2
5575; X86-NEXT:    vmovaps %xmm2, %xmm0
5576; X86-NEXT:    retl
5577;
5578; X64-LABEL: test_mm_mask3_fmsub_ps:
5579; X64:       # %bb.0: # %entry
5580; X64-NEXT:    kmovw %edi, %k1
5581; X64-NEXT:    vfmsub231ps {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2
5582; X64-NEXT:    vmovaps %xmm2, %xmm0
5583; X64-NEXT:    retq
5584entry:
5585  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5586  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
5587  %1 = bitcast i8 %__U to <8 x i1>
5588  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5589  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
5590  ret <4 x float> %2
5591}
5592
5593define <8 x float> @test_mm256_mask3_fmsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
5594; X86-LABEL: test_mm256_mask3_fmsub_ps:
5595; X86:       # %bb.0: # %entry
5596; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5597; X86-NEXT:    kmovw %eax, %k1
5598; X86-NEXT:    vfmsub231ps {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) - ymm2
5599; X86-NEXT:    vmovaps %ymm2, %ymm0
5600; X86-NEXT:    retl
5601;
5602; X64-LABEL: test_mm256_mask3_fmsub_ps:
5603; X64:       # %bb.0: # %entry
5604; X64-NEXT:    kmovw %edi, %k1
5605; X64-NEXT:    vfmsub231ps {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) - ymm2
5606; X64-NEXT:    vmovaps %ymm2, %ymm0
5607; X64-NEXT:    retq
5608entry:
5609  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5610  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
5611  %1 = bitcast i8 %__U to <8 x i1>
5612  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
5613  ret <8 x float> %2
5614}
5615
5616define <2 x double> @test_mm_mask3_fmsubadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
5617; X86-LABEL: test_mm_mask3_fmsubadd_pd:
5618; X86:       # %bb.0: # %entry
5619; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5620; X86-NEXT:    kmovw %eax, %k1
5621; X86-NEXT:    vfmsubadd231pd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) -/+ xmm2
5622; X86-NEXT:    vmovapd %xmm2, %xmm0
5623; X86-NEXT:    retl
5624;
5625; X64-LABEL: test_mm_mask3_fmsubadd_pd:
5626; X64:       # %bb.0: # %entry
5627; X64-NEXT:    kmovw %edi, %k1
5628; X64-NEXT:    vfmsubadd231pd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) -/+ xmm2
5629; X64-NEXT:    vmovapd %xmm2, %xmm0
5630; X64-NEXT:    retq
5631entry:
5632  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
5633  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
5634  %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
5635  %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3>
5636  %3 = bitcast i8 %__U to <8 x i1>
5637  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
5638  %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> %__C
5639  ret <2 x double> %4
5640}
5641
5642define <4 x double> @test_mm256_mask3_fmsubadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
5643; X86-LABEL: test_mm256_mask3_fmsubadd_pd:
5644; X86:       # %bb.0: # %entry
5645; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5646; X86-NEXT:    kmovw %eax, %k1
5647; X86-NEXT:    vfmsubadd231pd {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) -/+ ymm2
5648; X86-NEXT:    vmovapd %ymm2, %ymm0
5649; X86-NEXT:    retl
5650;
5651; X64-LABEL: test_mm256_mask3_fmsubadd_pd:
5652; X64:       # %bb.0: # %entry
5653; X64-NEXT:    kmovw %edi, %k1
5654; X64-NEXT:    vfmsubadd231pd {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) -/+ ymm2
5655; X64-NEXT:    vmovapd %ymm2, %ymm0
5656; X64-NEXT:    retq
5657entry:
5658  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
5659  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
5660  %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
5661  %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
5662  %3 = bitcast i8 %__U to <8 x i1>
5663  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5664  %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> %__C
5665  ret <4 x double> %4
5666}
5667
5668define <4 x float> @test_mm_mask3_fmsubadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
5669; X86-LABEL: test_mm_mask3_fmsubadd_ps:
5670; X86:       # %bb.0: # %entry
5671; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5672; X86-NEXT:    kmovw %eax, %k1
5673; X86-NEXT:    vfmsubadd231ps {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) -/+ xmm2
5674; X86-NEXT:    vmovaps %xmm2, %xmm0
5675; X86-NEXT:    retl
5676;
5677; X64-LABEL: test_mm_mask3_fmsubadd_ps:
5678; X64:       # %bb.0: # %entry
5679; X64-NEXT:    kmovw %edi, %k1
5680; X64-NEXT:    vfmsubadd231ps {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) -/+ xmm2
5681; X64-NEXT:    vmovaps %xmm2, %xmm0
5682; X64-NEXT:    retq
5683entry:
5684  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5685  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
5686  %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
5687  %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
5688  %3 = bitcast i8 %__U to <8 x i1>
5689  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5690  %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> %__C
5691  ret <4 x float> %4
5692}
5693
5694define <8 x float> @test_mm256_mask3_fmsubadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
5695; X86-LABEL: test_mm256_mask3_fmsubadd_ps:
5696; X86:       # %bb.0: # %entry
5697; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5698; X86-NEXT:    kmovw %eax, %k1
5699; X86-NEXT:    vfmsubadd231ps {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) -/+ ymm2
5700; X86-NEXT:    vmovaps %ymm2, %ymm0
5701; X86-NEXT:    retl
5702;
5703; X64-LABEL: test_mm256_mask3_fmsubadd_ps:
5704; X64:       # %bb.0: # %entry
5705; X64-NEXT:    kmovw %edi, %k1
5706; X64-NEXT:    vfmsubadd231ps {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) -/+ ymm2
5707; X64-NEXT:    vmovaps %ymm2, %ymm0
5708; X64-NEXT:    retq
5709entry:
5710  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5711  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
5712  %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
5713  %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
5714  %3 = bitcast i8 %__U to <8 x i1>
5715  %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %__C
5716  ret <8 x float> %4
5717}
5718
5719define <2 x double> @test_mm_mask_fnmadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
5720; X86-LABEL: test_mm_mask_fnmadd_pd:
5721; X86:       # %bb.0: # %entry
5722; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5723; X86-NEXT:    kmovw %eax, %k1
5724; X86-NEXT:    vfnmadd132pd {{.*#+}} xmm0 {%k1} = -(xmm0 * xmm1) + xmm2
5725; X86-NEXT:    retl
5726;
5727; X64-LABEL: test_mm_mask_fnmadd_pd:
5728; X64:       # %bb.0: # %entry
5729; X64-NEXT:    kmovw %edi, %k1
5730; X64-NEXT:    vfnmadd132pd {{.*#+}} xmm0 {%k1} = -(xmm0 * xmm1) + xmm2
5731; X64-NEXT:    retq
5732entry:
5733  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
5734  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %__C) #9
5735  %1 = bitcast i8 %__U to <8 x i1>
5736  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
5737  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
5738  ret <2 x double> %2
5739}
5740
5741define <4 x double> @test_mm256_mask_fnmadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
5742; X86-LABEL: test_mm256_mask_fnmadd_pd:
5743; X86:       # %bb.0: # %entry
5744; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5745; X86-NEXT:    kmovw %eax, %k1
5746; X86-NEXT:    vfnmadd132pd {{.*#+}} ymm0 {%k1} = -(ymm0 * ymm1) + ymm2
5747; X86-NEXT:    retl
5748;
5749; X64-LABEL: test_mm256_mask_fnmadd_pd:
5750; X64:       # %bb.0: # %entry
5751; X64-NEXT:    kmovw %edi, %k1
5752; X64-NEXT:    vfnmadd132pd {{.*#+}} ymm0 {%k1} = -(ymm0 * ymm1) + ymm2
5753; X64-NEXT:    retq
5754entry:
5755  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
5756  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %__C) #9
5757  %1 = bitcast i8 %__U to <8 x i1>
5758  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5759  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
5760  ret <4 x double> %2
5761}
5762
5763define <4 x float> @test_mm_mask_fnmadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
5764; X86-LABEL: test_mm_mask_fnmadd_ps:
5765; X86:       # %bb.0: # %entry
5766; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5767; X86-NEXT:    kmovw %eax, %k1
5768; X86-NEXT:    vfnmadd132ps {{.*#+}} xmm0 {%k1} = -(xmm0 * xmm1) + xmm2
5769; X86-NEXT:    retl
5770;
5771; X64-LABEL: test_mm_mask_fnmadd_ps:
5772; X64:       # %bb.0: # %entry
5773; X64-NEXT:    kmovw %edi, %k1
5774; X64-NEXT:    vfnmadd132ps {{.*#+}} xmm0 {%k1} = -(xmm0 * xmm1) + xmm2
5775; X64-NEXT:    retq
5776entry:
5777  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
5778  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %__C) #9
5779  %1 = bitcast i8 %__U to <8 x i1>
5780  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5781  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
5782  ret <4 x float> %2
5783}
5784
5785define <8 x float> @test_mm256_mask_fnmadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
5786; X86-LABEL: test_mm256_mask_fnmadd_ps:
5787; X86:       # %bb.0: # %entry
5788; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5789; X86-NEXT:    kmovw %eax, %k1
5790; X86-NEXT:    vfnmadd132ps {{.*#+}} ymm0 {%k1} = -(ymm0 * ymm1) + ymm2
5791; X86-NEXT:    retl
5792;
5793; X64-LABEL: test_mm256_mask_fnmadd_ps:
5794; X64:       # %bb.0: # %entry
5795; X64-NEXT:    kmovw %edi, %k1
5796; X64-NEXT:    vfnmadd132ps {{.*#+}} ymm0 {%k1} = -(ymm0 * ymm1) + ymm2
5797; X64-NEXT:    retq
5798entry:
5799  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
5800  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %__C) #9
5801  %1 = bitcast i8 %__U to <8 x i1>
5802  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
5803  ret <8 x float> %2
5804}
5805
5806define <2 x double> @test_mm_mask_fnmsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
5807; X86-LABEL: test_mm_mask_fnmsub_pd:
5808; X86:       # %bb.0: # %entry
5809; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5810; X86-NEXT:    kmovw %eax, %k1
5811; X86-NEXT:    vfnmsub132pd {{.*#+}} xmm0 {%k1} = -(xmm0 * xmm1) - xmm2
5812; X86-NEXT:    retl
5813;
5814; X64-LABEL: test_mm_mask_fnmsub_pd:
5815; X64:       # %bb.0: # %entry
5816; X64-NEXT:    kmovw %edi, %k1
5817; X64-NEXT:    vfnmsub132pd {{.*#+}} xmm0 {%k1} = -(xmm0 * xmm1) - xmm2
5818; X64-NEXT:    retq
5819entry:
5820  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
5821  %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
5822  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %sub1.i) #9
5823  %1 = bitcast i8 %__U to <8 x i1>
5824  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
5825  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
5826  ret <2 x double> %2
5827}
5828
5829define <2 x double> @test_mm_mask3_fnmsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
5830; X86-LABEL: test_mm_mask3_fnmsub_pd:
5831; X86:       # %bb.0: # %entry
5832; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5833; X86-NEXT:    kmovw %eax, %k1
5834; X86-NEXT:    vfnmsub231pd {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
5835; X86-NEXT:    vmovapd %xmm2, %xmm0
5836; X86-NEXT:    retl
5837;
5838; X64-LABEL: test_mm_mask3_fnmsub_pd:
5839; X64:       # %bb.0: # %entry
5840; X64-NEXT:    kmovw %edi, %k1
5841; X64-NEXT:    vfnmsub231pd {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
5842; X64-NEXT:    vmovapd %xmm2, %xmm0
5843; X64-NEXT:    retq
5844entry:
5845  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
5846  %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
5847  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %sub1.i) #9
5848  %1 = bitcast i8 %__U to <8 x i1>
5849  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
5850  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
5851  ret <2 x double> %2
5852}
5853
5854define <4 x double> @test_mm256_mask_fnmsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
5855; X86-LABEL: test_mm256_mask_fnmsub_pd:
5856; X86:       # %bb.0: # %entry
5857; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5858; X86-NEXT:    kmovw %eax, %k1
5859; X86-NEXT:    vfnmsub132pd {{.*#+}} ymm0 {%k1} = -(ymm0 * ymm1) - ymm2
5860; X86-NEXT:    retl
5861;
5862; X64-LABEL: test_mm256_mask_fnmsub_pd:
5863; X64:       # %bb.0: # %entry
5864; X64-NEXT:    kmovw %edi, %k1
5865; X64-NEXT:    vfnmsub132pd {{.*#+}} ymm0 {%k1} = -(ymm0 * ymm1) - ymm2
5866; X64-NEXT:    retq
5867entry:
5868  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
5869  %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
5870  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %sub1.i) #9
5871  %1 = bitcast i8 %__U to <8 x i1>
5872  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5873  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
5874  ret <4 x double> %2
5875}
5876
5877define <4 x double> @test_mm256_mask3_fnmsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
5878; X86-LABEL: test_mm256_mask3_fnmsub_pd:
5879; X86:       # %bb.0: # %entry
5880; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5881; X86-NEXT:    kmovw %eax, %k1
5882; X86-NEXT:    vfnmsub231pd {{.*#+}} ymm2 {%k1} = -(ymm0 * ymm1) - ymm2
5883; X86-NEXT:    vmovapd %ymm2, %ymm0
5884; X86-NEXT:    retl
5885;
5886; X64-LABEL: test_mm256_mask3_fnmsub_pd:
5887; X64:       # %bb.0: # %entry
5888; X64-NEXT:    kmovw %edi, %k1
5889; X64-NEXT:    vfnmsub231pd {{.*#+}} ymm2 {%k1} = -(ymm0 * ymm1) - ymm2
5890; X64-NEXT:    vmovapd %ymm2, %ymm0
5891; X64-NEXT:    retq
5892entry:
5893  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
5894  %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
5895  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %sub1.i) #9
5896  %1 = bitcast i8 %__U to <8 x i1>
5897  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5898  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
5899  ret <4 x double> %2
5900}
5901
5902define <4 x float> @test_mm_mask_fnmsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
5903; X86-LABEL: test_mm_mask_fnmsub_ps:
5904; X86:       # %bb.0: # %entry
5905; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5906; X86-NEXT:    kmovw %eax, %k1
5907; X86-NEXT:    vfnmsub132ps {{.*#+}} xmm0 {%k1} = -(xmm0 * xmm1) - xmm2
5908; X86-NEXT:    retl
5909;
5910; X64-LABEL: test_mm_mask_fnmsub_ps:
5911; X64:       # %bb.0: # %entry
5912; X64-NEXT:    kmovw %edi, %k1
5913; X64-NEXT:    vfnmsub132ps {{.*#+}} xmm0 {%k1} = -(xmm0 * xmm1) - xmm2
5914; X64-NEXT:    retq
5915entry:
5916  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
5917  %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5918  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %sub1.i) #9
5919  %1 = bitcast i8 %__U to <8 x i1>
5920  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5921  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
5922  ret <4 x float> %2
5923}
5924
5925define <4 x float> @test_mm_mask3_fnmsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
5926; X86-LABEL: test_mm_mask3_fnmsub_ps:
5927; X86:       # %bb.0: # %entry
5928; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5929; X86-NEXT:    kmovw %eax, %k1
5930; X86-NEXT:    vfnmsub231ps {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
5931; X86-NEXT:    vmovaps %xmm2, %xmm0
5932; X86-NEXT:    retl
5933;
5934; X64-LABEL: test_mm_mask3_fnmsub_ps:
5935; X64:       # %bb.0: # %entry
5936; X64-NEXT:    kmovw %edi, %k1
5937; X64-NEXT:    vfnmsub231ps {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
5938; X64-NEXT:    vmovaps %xmm2, %xmm0
5939; X64-NEXT:    retq
5940entry:
5941  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
5942  %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5943  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %sub1.i) #9
5944  %1 = bitcast i8 %__U to <8 x i1>
5945  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5946  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
5947  ret <4 x float> %2
5948}
5949
5950define <8 x float> @test_mm256_mask_fnmsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
5951; X86-LABEL: test_mm256_mask_fnmsub_ps:
5952; X86:       # %bb.0: # %entry
5953; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5954; X86-NEXT:    kmovw %eax, %k1
5955; X86-NEXT:    vfnmsub132ps {{.*#+}} ymm0 {%k1} = -(ymm0 * ymm1) - ymm2
5956; X86-NEXT:    retl
5957;
5958; X64-LABEL: test_mm256_mask_fnmsub_ps:
5959; X64:       # %bb.0: # %entry
5960; X64-NEXT:    kmovw %edi, %k1
5961; X64-NEXT:    vfnmsub132ps {{.*#+}} ymm0 {%k1} = -(ymm0 * ymm1) - ymm2
5962; X64-NEXT:    retq
5963entry:
5964  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
5965  %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5966  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %sub1.i) #9
5967  %1 = bitcast i8 %__U to <8 x i1>
5968  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
5969  ret <8 x float> %2
5970}
5971
5972define <8 x float> @test_mm256_mask3_fnmsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
5973; X86-LABEL: test_mm256_mask3_fnmsub_ps:
5974; X86:       # %bb.0: # %entry
5975; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5976; X86-NEXT:    kmovw %eax, %k1
5977; X86-NEXT:    vfnmsub231ps {{.*#+}} ymm2 {%k1} = -(ymm0 * ymm1) - ymm2
5978; X86-NEXT:    vmovaps %ymm2, %ymm0
5979; X86-NEXT:    retl
5980;
5981; X64-LABEL: test_mm256_mask3_fnmsub_ps:
5982; X64:       # %bb.0: # %entry
5983; X64-NEXT:    kmovw %edi, %k1
5984; X64-NEXT:    vfnmsub231ps {{.*#+}} ymm2 {%k1} = -(ymm0 * ymm1) - ymm2
5985; X64-NEXT:    vmovaps %ymm2, %ymm0
5986; X64-NEXT:    retq
5987entry:
5988  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
5989  %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5990  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %sub1.i) #9
5991  %1 = bitcast i8 %__U to <8 x i1>
5992  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
5993  ret <8 x float> %2
5994}
5995
5996define <2 x double> @test_mm_mask_expandloadu_pd(<2 x double> %__W, i8 zeroext %__U, i8* readonly %__P) {
5997; X86-LABEL: test_mm_mask_expandloadu_pd:
5998; X86:       # %bb.0: # %entry
5999; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
6000; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
6001; X86-NEXT:    kmovw %ecx, %k1
6002; X86-NEXT:    vexpandpd (%eax), %xmm0 {%k1}
6003; X86-NEXT:    retl
6004;
6005; X64-LABEL: test_mm_mask_expandloadu_pd:
6006; X64:       # %bb.0: # %entry
6007; X64-NEXT:    kmovw %edi, %k1
6008; X64-NEXT:    vexpandpd (%rsi), %xmm0 {%k1}
6009; X64-NEXT:    retq
6010entry:
6011  %0 = bitcast i8* %__P to double*
6012  %1 = bitcast i8 %__U to <8 x i1>
6013  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
6014  %2 = tail call <2 x double> @llvm.masked.expandload.v2f64(double* %0, <2 x i1> %extract.i, <2 x double> %__W)
6015  ret <2 x double> %2
6016}
6017
6018define <2 x double> @test_mm_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) {
6019; X86-LABEL: test_mm_maskz_expandloadu_pd:
6020; X86:       # %bb.0: # %entry
6021; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
6022; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
6023; X86-NEXT:    kmovw %ecx, %k1
6024; X86-NEXT:    vexpandpd (%eax), %xmm0 {%k1} {z}
6025; X86-NEXT:    retl
6026;
6027; X64-LABEL: test_mm_maskz_expandloadu_pd:
6028; X64:       # %bb.0: # %entry
6029; X64-NEXT:    kmovw %edi, %k1
6030; X64-NEXT:    vexpandpd (%rsi), %xmm0 {%k1} {z}
6031; X64-NEXT:    retq
6032entry:
6033  %0 = bitcast i8* %__P to double*
6034  %1 = bitcast i8 %__U to <8 x i1>
6035  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
6036  %2 = tail call <2 x double> @llvm.masked.expandload.v2f64(double* %0, <2 x i1> %extract.i, <2 x double> zeroinitializer)
6037  ret <2 x double> %2
6038}
6039
6040define <4 x double> @test_mm256_mask_expandloadu_pd(<4 x double> %__W, i8 zeroext %__U, i8* readonly %__P) {
6041; X86-LABEL: test_mm256_mask_expandloadu_pd:
6042; X86:       # %bb.0: # %entry
6043; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
6044; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
6045; X86-NEXT:    kmovw %ecx, %k1
6046; X86-NEXT:    vexpandpd (%eax), %ymm0 {%k1}
6047; X86-NEXT:    retl
6048;
6049; X64-LABEL: test_mm256_mask_expandloadu_pd:
6050; X64:       # %bb.0: # %entry
6051; X64-NEXT:    kmovw %edi, %k1
6052; X64-NEXT:    vexpandpd (%rsi), %ymm0 {%k1}
6053; X64-NEXT:    retq
6054entry:
6055  %0 = bitcast i8* %__P to double*
6056  %1 = bitcast i8 %__U to <8 x i1>
6057  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6058  %2 = tail call <4 x double> @llvm.masked.expandload.v4f64(double* %0, <4 x i1> %extract.i, <4 x double> %__W)
6059  ret <4 x double> %2
6060}
6061
6062define <4 x double> @test_mm256_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) {
6063; X86-LABEL: test_mm256_maskz_expandloadu_pd:
6064; X86:       # %bb.0: # %entry
6065; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
6066; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
6067; X86-NEXT:    kmovw %ecx, %k1
6068; X86-NEXT:    vexpandpd (%eax), %ymm0 {%k1} {z}
6069; X86-NEXT:    retl
6070;
6071; X64-LABEL: test_mm256_maskz_expandloadu_pd:
6072; X64:       # %bb.0: # %entry
6073; X64-NEXT:    kmovw %edi, %k1
6074; X64-NEXT:    vexpandpd (%rsi), %ymm0 {%k1} {z}
6075; X64-NEXT:    retq
6076entry:
6077  %0 = bitcast i8* %__P to double*
6078  %1 = bitcast i8 %__U to <8 x i1>
6079  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6080  %2 = tail call <4 x double> @llvm.masked.expandload.v4f64(double* %0, <4 x i1> %extract.i, <4 x double> zeroinitializer)
6081  ret <4 x double> %2
6082}
6083
6084define <2 x i64> @test_mm_mask_expandloadu_epi64(<2 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
6085; X86-LABEL: test_mm_mask_expandloadu_epi64:
6086; X86:       # %bb.0: # %entry
6087; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
6088; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
6089; X86-NEXT:    kmovw %ecx, %k1
6090; X86-NEXT:    vpexpandq (%eax), %xmm0 {%k1}
6091; X86-NEXT:    retl
6092;
6093; X64-LABEL: test_mm_mask_expandloadu_epi64:
6094; X64:       # %bb.0: # %entry
6095; X64-NEXT:    kmovw %edi, %k1
6096; X64-NEXT:    vpexpandq (%rsi), %xmm0 {%k1}
6097; X64-NEXT:    retq
6098entry:
6099  %0 = bitcast i8* %__P to i64*
6100  %1 = bitcast i8 %__U to <8 x i1>
6101  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
6102  %2 = tail call <2 x i64> @llvm.masked.expandload.v2i64(i64* %0, <2 x i1> %extract.i, <2 x i64> %__W) #10
6103  ret <2 x i64> %2
6104}
6105
6106define <2 x i64> @test_mm_maskz_expandloadu_epi64(i8 zeroext %__U, i8* readonly %__P) {
6107; X86-LABEL: test_mm_maskz_expandloadu_epi64:
6108; X86:       # %bb.0: # %entry
6109; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
6110; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
6111; X86-NEXT:    kmovw %ecx, %k1
6112; X86-NEXT:    vpexpandq (%eax), %xmm0 {%k1} {z}
6113; X86-NEXT:    retl
6114;
6115; X64-LABEL: test_mm_maskz_expandloadu_epi64:
6116; X64:       # %bb.0: # %entry
6117; X64-NEXT:    kmovw %edi, %k1
6118; X64-NEXT:    vpexpandq (%rsi), %xmm0 {%k1} {z}
6119; X64-NEXT:    retq
6120entry:
6121  %0 = bitcast i8* %__P to i64*
6122  %1 = bitcast i8 %__U to <8 x i1>
6123  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
6124  %2 = tail call <2 x i64> @llvm.masked.expandload.v2i64(i64* %0, <2 x i1> %extract.i, <2 x i64> zeroinitializer)
6125  ret <2 x i64> %2
6126}
6127
6128define <4 x i64> @test_mm256_mask_expandloadu_epi64(<4 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
6129; X86-LABEL: test_mm256_mask_expandloadu_epi64:
6130; X86:       # %bb.0: # %entry
6131; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
6132; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
6133; X86-NEXT:    kmovw %ecx, %k1
6134; X86-NEXT:    vpexpandq (%eax), %ymm0 {%k1}
6135; X86-NEXT:    retl
6136;
6137; X64-LABEL: test_mm256_mask_expandloadu_epi64:
6138; X64:       # %bb.0: # %entry
6139; X64-NEXT:    kmovw %edi, %k1
6140; X64-NEXT:    vpexpandq (%rsi), %ymm0 {%k1}
6141; X64-NEXT:    retq
6142entry:
6143  %0 = bitcast i8* %__P to i64*
6144  %1 = bitcast i8 %__U to <8 x i1>
6145  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6146  %2 = tail call <4 x i64> @llvm.masked.expandload.v4i64(i64* %0, <4 x i1> %extract.i, <4 x i64> %__W) #10
6147  ret <4 x i64> %2
6148}
6149
6150define <4 x i64> @test_mm256_maskz_expandloadu_epi64(i8 zeroext %__U, i8* readonly %__P) {
6151; X86-LABEL: test_mm256_maskz_expandloadu_epi64:
6152; X86:       # %bb.0: # %entry
6153; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
6154; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
6155; X86-NEXT:    kmovw %ecx, %k1
6156; X86-NEXT:    vpexpandq (%eax), %ymm0 {%k1} {z}
6157; X86-NEXT:    retl
6158;
6159; X64-LABEL: test_mm256_maskz_expandloadu_epi64:
6160; X64:       # %bb.0: # %entry
6161; X64-NEXT:    kmovw %edi, %k1
6162; X64-NEXT:    vpexpandq (%rsi), %ymm0 {%k1} {z}
6163; X64-NEXT:    retq
6164entry:
6165  %0 = bitcast i8* %__P to i64*
6166  %1 = bitcast i8 %__U to <8 x i1>
6167  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6168  %2 = tail call <4 x i64> @llvm.masked.expandload.v4i64(i64* %0, <4 x i1> %extract.i, <4 x i64> zeroinitializer)
6169  ret <4 x i64> %2
6170}
6171
6172define <4 x float> @test_mm_mask_expandloadu_ps(<4 x float> %__W, i8 zeroext %__U, i8* readonly %__P) {
6173; X86-LABEL: test_mm_mask_expandloadu_ps:
6174; X86:       # %bb.0: # %entry
6175; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
6176; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
6177; X86-NEXT:    kmovw %ecx, %k1
6178; X86-NEXT:    vexpandps (%eax), %xmm0 {%k1}
6179; X86-NEXT:    retl
6180;
6181; X64-LABEL: test_mm_mask_expandloadu_ps:
6182; X64:       # %bb.0: # %entry
6183; X64-NEXT:    kmovw %edi, %k1
6184; X64-NEXT:    vexpandps (%rsi), %xmm0 {%k1}
6185; X64-NEXT:    retq
6186entry:
6187  %0 = bitcast i8* %__P to float*
6188  %1 = bitcast i8 %__U to <8 x i1>
6189  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6190  %2 = tail call <4 x float> @llvm.masked.expandload.v4f32(float* %0, <4 x i1> %extract.i, <4 x float> %__W)
6191  ret <4 x float> %2
6192}
6193
6194define <4 x float> @test_mm_maskz_expandloadu_ps(i8 zeroext %__U, i8* readonly %__P) {
6195; X86-LABEL: test_mm_maskz_expandloadu_ps:
6196; X86:       # %bb.0: # %entry
6197; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
6198; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
6199; X86-NEXT:    kmovw %ecx, %k1
6200; X86-NEXT:    vexpandps (%eax), %xmm0 {%k1} {z}
6201; X86-NEXT:    retl
6202;
6203; X64-LABEL: test_mm_maskz_expandloadu_ps:
6204; X64:       # %bb.0: # %entry
6205; X64-NEXT:    kmovw %edi, %k1
6206; X64-NEXT:    vexpandps (%rsi), %xmm0 {%k1} {z}
6207; X64-NEXT:    retq
6208entry:
6209  %0 = bitcast i8* %__P to float*
6210  %1 = bitcast i8 %__U to <8 x i1>
6211  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6212  %2 = tail call <4 x float> @llvm.masked.expandload.v4f32(float* %0, <4 x i1> %extract.i, <4 x float> zeroinitializer)
6213  ret <4 x float> %2
6214}
6215
6216define <8 x float> @test_mm256_mask_expandloadu_ps(<8 x float> %__W, i8 zeroext %__U, i8* readonly %__P) {
6217; X86-LABEL: test_mm256_mask_expandloadu_ps:
6218; X86:       # %bb.0: # %entry
6219; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
6220; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
6221; X86-NEXT:    kmovw %ecx, %k1
6222; X86-NEXT:    vexpandps (%eax), %ymm0 {%k1}
6223; X86-NEXT:    retl
6224;
6225; X64-LABEL: test_mm256_mask_expandloadu_ps:
6226; X64:       # %bb.0: # %entry
6227; X64-NEXT:    kmovw %edi, %k1
6228; X64-NEXT:    vexpandps (%rsi), %ymm0 {%k1}
6229; X64-NEXT:    retq
6230entry:
6231  %0 = bitcast i8* %__P to float*
6232  %1 = bitcast i8 %__U to <8 x i1>
6233  %2 = tail call <8 x float> @llvm.masked.expandload.v8f32(float* %0, <8 x i1> %1, <8 x float> %__W)
6234  ret <8 x float> %2
6235}
6236
6237define <8 x float> @test_mm256_maskz_expandloadu_ps(i8 zeroext %__U, i8* readonly %__P) {
6238; X86-LABEL: test_mm256_maskz_expandloadu_ps:
6239; X86:       # %bb.0: # %entry
6240; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
6241; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
6242; X86-NEXT:    kmovw %ecx, %k1
6243; X86-NEXT:    vexpandps (%eax), %ymm0 {%k1} {z}
6244; X86-NEXT:    retl
6245;
6246; X64-LABEL: test_mm256_maskz_expandloadu_ps:
6247; X64:       # %bb.0: # %entry
6248; X64-NEXT:    kmovw %edi, %k1
6249; X64-NEXT:    vexpandps (%rsi), %ymm0 {%k1} {z}
6250; X64-NEXT:    retq
6251entry:
6252  %0 = bitcast i8* %__P to float*
6253  %1 = bitcast i8 %__U to <8 x i1>
6254  %2 = tail call <8 x float> @llvm.masked.expandload.v8f32(float* %0, <8 x i1> %1, <8 x float> zeroinitializer)
6255  ret <8 x float> %2
6256}
6257
6258define <2 x i64> @test_mm_mask_expandloadu_epi32(<2 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
6259; X86-LABEL: test_mm_mask_expandloadu_epi32:
6260; X86:       # %bb.0: # %entry
6261; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
6262; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
6263; X86-NEXT:    kmovw %ecx, %k1
6264; X86-NEXT:    vpexpandd (%eax), %xmm0 {%k1}
6265; X86-NEXT:    retl
6266;
6267; X64-LABEL: test_mm_mask_expandloadu_epi32:
6268; X64:       # %bb.0: # %entry
6269; X64-NEXT:    kmovw %edi, %k1
6270; X64-NEXT:    vpexpandd (%rsi), %xmm0 {%k1}
6271; X64-NEXT:    retq
6272entry:
6273  %0 = bitcast <2 x i64> %__W to <4 x i32>
6274  %1 = bitcast i8* %__P to i32*
6275  %2 = bitcast i8 %__U to <8 x i1>
6276  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6277  %3 = tail call <4 x i32> @llvm.masked.expandload.v4i32(i32* %1, <4 x i1> %extract.i, <4 x i32> %0)
6278  %4 = bitcast <4 x i32> %3 to <2 x i64>
6279  ret <2 x i64> %4
6280}
6281
6282define <2 x i64> @test_mm_maskz_expandloadu_epi32(i8 zeroext %__U, i8* readonly %__P) {
6283; X86-LABEL: test_mm_maskz_expandloadu_epi32:
6284; X86:       # %bb.0: # %entry
6285; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
6286; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
6287; X86-NEXT:    kmovw %ecx, %k1
6288; X86-NEXT:    vpexpandd (%eax), %xmm0 {%k1} {z}
6289; X86-NEXT:    retl
6290;
6291; X64-LABEL: test_mm_maskz_expandloadu_epi32:
6292; X64:       # %bb.0: # %entry
6293; X64-NEXT:    kmovw %edi, %k1
6294; X64-NEXT:    vpexpandd (%rsi), %xmm0 {%k1} {z}
6295; X64-NEXT:    retq
6296entry:
6297  %0 = bitcast i8* %__P to i32*
6298  %1 = bitcast i8 %__U to <8 x i1>
6299  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6300  %2 = tail call <4 x i32> @llvm.masked.expandload.v4i32(i32* %0, <4 x i1> %extract.i, <4 x i32> zeroinitializer)
6301  %3 = bitcast <4 x i32> %2 to <2 x i64>
6302  ret <2 x i64> %3
6303}
6304
6305define <4 x i64> @test_mm256_mask_expandloadu_epi32(<4 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
6306; X86-LABEL: test_mm256_mask_expandloadu_epi32:
6307; X86:       # %bb.0: # %entry
6308; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
6309; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
6310; X86-NEXT:    kmovw %ecx, %k1
6311; X86-NEXT:    vpexpandd (%eax), %ymm0 {%k1}
6312; X86-NEXT:    retl
6313;
6314; X64-LABEL: test_mm256_mask_expandloadu_epi32:
6315; X64:       # %bb.0: # %entry
6316; X64-NEXT:    kmovw %edi, %k1
6317; X64-NEXT:    vpexpandd (%rsi), %ymm0 {%k1}
6318; X64-NEXT:    retq
6319entry:
6320  %0 = bitcast <4 x i64> %__W to <8 x i32>
6321  %1 = bitcast i8* %__P to i32*
6322  %2 = bitcast i8 %__U to <8 x i1>
6323  %3 = tail call <8 x i32> @llvm.masked.expandload.v8i32(i32* %1, <8 x i1> %2, <8 x i32> %0)
6324  %4 = bitcast <8 x i32> %3 to <4 x i64>
6325  ret <4 x i64> %4
6326}
6327
6328define <4 x i64> @test_mm256_maskz_expandloadu_epi32(i8 zeroext %__U, i8* readonly %__P) {
6329; X86-LABEL: test_mm256_maskz_expandloadu_epi32:
6330; X86:       # %bb.0: # %entry
6331; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
6332; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
6333; X86-NEXT:    kmovw %ecx, %k1
6334; X86-NEXT:    vpexpandd (%eax), %ymm0 {%k1} {z}
6335; X86-NEXT:    retl
6336;
6337; X64-LABEL: test_mm256_maskz_expandloadu_epi32:
6338; X64:       # %bb.0: # %entry
6339; X64-NEXT:    kmovw %edi, %k1
6340; X64-NEXT:    vpexpandd (%rsi), %ymm0 {%k1} {z}
6341; X64-NEXT:    retq
6342entry:
6343  %0 = bitcast i8* %__P to i32*
6344  %1 = bitcast i8 %__U to <8 x i1>
6345  %2 = tail call <8 x i32> @llvm.masked.expandload.v8i32(i32* %0, <8 x i1> %1, <8 x i32> zeroinitializer)
6346  %3 = bitcast <8 x i32> %2 to <4 x i64>
6347  ret <4 x i64> %3
6348}
6349
6350define void @test_mm_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <2 x double> %__A) {
6351; X86-LABEL: test_mm_mask_compressstoreu_pd:
6352; X86:       # %bb.0: # %entry
6353; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
6354; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
6355; X86-NEXT:    kmovw %eax, %k1
6356; X86-NEXT:    vcompresspd %xmm0, (%ecx) {%k1}
6357; X86-NEXT:    retl
6358;
6359; X64-LABEL: test_mm_mask_compressstoreu_pd:
6360; X64:       # %bb.0: # %entry
6361; X64-NEXT:    kmovw %esi, %k1
6362; X64-NEXT:    vcompresspd %xmm0, (%rdi) {%k1}
6363; X64-NEXT:    retq
6364entry:
6365  %0 = bitcast i8* %__P to double*
6366  %1 = bitcast i8 %__U to <8 x i1>
6367  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
6368  tail call void @llvm.masked.compressstore.v2f64(<2 x double> %__A, double* %0, <2 x i1> %extract.i)
6369  ret void
6370}
6371
6372define void @test_mm256_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <4 x double> %__A) {
6373; X86-LABEL: test_mm256_mask_compressstoreu_pd:
6374; X86:       # %bb.0: # %entry
6375; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
6376; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
6377; X86-NEXT:    kmovw %eax, %k1
6378; X86-NEXT:    vcompresspd %ymm0, (%ecx) {%k1}
6379; X86-NEXT:    vzeroupper
6380; X86-NEXT:    retl
6381;
6382; X64-LABEL: test_mm256_mask_compressstoreu_pd:
6383; X64:       # %bb.0: # %entry
6384; X64-NEXT:    kmovw %esi, %k1
6385; X64-NEXT:    vcompresspd %ymm0, (%rdi) {%k1}
6386; X64-NEXT:    vzeroupper
6387; X64-NEXT:    retq
6388entry:
6389  %0 = bitcast i8* %__P to double*
6390  %1 = bitcast i8 %__U to <8 x i1>
6391  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6392  tail call void @llvm.masked.compressstore.v4f64(<4 x double> %__A, double* %0, <4 x i1> %extract.i)
6393  ret void
6394}
6395
6396define void @test_mm_mask_compressstoreu_epi64(i8* %__P, i8 zeroext %__U, <2 x i64> %__A) {
6397; X86-LABEL: test_mm_mask_compressstoreu_epi64:
6398; X86:       # %bb.0: # %entry
6399; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
6400; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
6401; X86-NEXT:    kmovw %eax, %k1
6402; X86-NEXT:    vpcompressq %xmm0, (%ecx) {%k1}
6403; X86-NEXT:    retl
6404;
6405; X64-LABEL: test_mm_mask_compressstoreu_epi64:
6406; X64:       # %bb.0: # %entry
6407; X64-NEXT:    kmovw %esi, %k1
6408; X64-NEXT:    vpcompressq %xmm0, (%rdi) {%k1}
6409; X64-NEXT:    retq
6410entry:
6411  %0 = bitcast i8* %__P to i64*
6412  %1 = bitcast i8 %__U to <8 x i1>
6413  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
6414  tail call void @llvm.masked.compressstore.v2i64(<2 x i64> %__A, i64* %0, <2 x i1> %extract.i)
6415  ret void
6416}
6417
6418define void @test_mm256_mask_compressstoreu_epi64(i8* %__P, i8 zeroext %__U, <4 x i64> %__A) {
6419; X86-LABEL: test_mm256_mask_compressstoreu_epi64:
6420; X86:       # %bb.0: # %entry
6421; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
6422; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
6423; X86-NEXT:    kmovw %eax, %k1
6424; X86-NEXT:    vpcompressq %ymm0, (%ecx) {%k1}
6425; X86-NEXT:    vzeroupper
6426; X86-NEXT:    retl
6427;
6428; X64-LABEL: test_mm256_mask_compressstoreu_epi64:
6429; X64:       # %bb.0: # %entry
6430; X64-NEXT:    kmovw %esi, %k1
6431; X64-NEXT:    vpcompressq %ymm0, (%rdi) {%k1}
6432; X64-NEXT:    vzeroupper
6433; X64-NEXT:    retq
6434entry:
6435  %0 = bitcast i8* %__P to i64*
6436  %1 = bitcast i8 %__U to <8 x i1>
6437  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6438  tail call void @llvm.masked.compressstore.v4i64(<4 x i64> %__A, i64* %0, <4 x i1> %extract.i)
6439  ret void
6440}
6441
6442define void @test_mm_mask_compressstoreu_ps(i8* %__P, i8 zeroext %__U, <4 x float> %__A) {
6443; X86-LABEL: test_mm_mask_compressstoreu_ps:
6444; X86:       # %bb.0: # %entry
6445; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
6446; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
6447; X86-NEXT:    kmovw %eax, %k1
6448; X86-NEXT:    vcompressps %xmm0, (%ecx) {%k1}
6449; X86-NEXT:    retl
6450;
6451; X64-LABEL: test_mm_mask_compressstoreu_ps:
6452; X64:       # %bb.0: # %entry
6453; X64-NEXT:    kmovw %esi, %k1
6454; X64-NEXT:    vcompressps %xmm0, (%rdi) {%k1}
6455; X64-NEXT:    retq
6456entry:
6457  %0 = bitcast i8* %__P to float*
6458  %1 = bitcast i8 %__U to <8 x i1>
6459  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6460  tail call void @llvm.masked.compressstore.v4f32(<4 x float> %__A, float* %0, <4 x i1> %extract.i)
6461  ret void
6462}
6463
6464define void @test_mm256_mask_compressstoreu_ps(i8* %__P, i8 zeroext %__U, <8 x float> %__A) {
6465; X86-LABEL: test_mm256_mask_compressstoreu_ps:
6466; X86:       # %bb.0: # %entry
6467; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
6468; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
6469; X86-NEXT:    kmovw %eax, %k1
6470; X86-NEXT:    vcompressps %ymm0, (%ecx) {%k1}
6471; X86-NEXT:    vzeroupper
6472; X86-NEXT:    retl
6473;
6474; X64-LABEL: test_mm256_mask_compressstoreu_ps:
6475; X64:       # %bb.0: # %entry
6476; X64-NEXT:    kmovw %esi, %k1
6477; X64-NEXT:    vcompressps %ymm0, (%rdi) {%k1}
6478; X64-NEXT:    vzeroupper
6479; X64-NEXT:    retq
6480entry:
6481  %0 = bitcast i8* %__P to float*
6482  %1 = bitcast i8 %__U to <8 x i1>
6483  tail call void @llvm.masked.compressstore.v8f32(<8 x float> %__A, float* %0, <8 x i1> %1)
6484  ret void
6485}
6486
6487define void @test_mm_mask_compressstoreu_epi32(i8* %__P, i8 zeroext %__U, <2 x i64> %__A) {
6488; X86-LABEL: test_mm_mask_compressstoreu_epi32:
6489; X86:       # %bb.0: # %entry
6490; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
6491; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
6492; X86-NEXT:    kmovw %eax, %k1
6493; X86-NEXT:    vpcompressd %xmm0, (%ecx) {%k1}
6494; X86-NEXT:    retl
6495;
6496; X64-LABEL: test_mm_mask_compressstoreu_epi32:
6497; X64:       # %bb.0: # %entry
6498; X64-NEXT:    kmovw %esi, %k1
6499; X64-NEXT:    vpcompressd %xmm0, (%rdi) {%k1}
6500; X64-NEXT:    retq
6501entry:
6502  %0 = bitcast <2 x i64> %__A to <4 x i32>
6503  %1 = bitcast i8* %__P to i32*
6504  %2 = bitcast i8 %__U to <8 x i1>
6505  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6506  tail call void @llvm.masked.compressstore.v4i32(<4 x i32> %0, i32* %1, <4 x i1> %extract.i)
6507  ret void
6508}
6509
6510define void @test_mm256_mask_compressstoreu_epi32(i8* %__P, i8 zeroext %__U, <4 x i64> %__A) {
6511; X86-LABEL: test_mm256_mask_compressstoreu_epi32:
6512; X86:       # %bb.0: # %entry
6513; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
6514; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
6515; X86-NEXT:    kmovw %eax, %k1
6516; X86-NEXT:    vpcompressd %ymm0, (%ecx) {%k1}
6517; X86-NEXT:    vzeroupper
6518; X86-NEXT:    retl
6519;
6520; X64-LABEL: test_mm256_mask_compressstoreu_epi32:
6521; X64:       # %bb.0: # %entry
6522; X64-NEXT:    kmovw %esi, %k1
6523; X64-NEXT:    vpcompressd %ymm0, (%rdi) {%k1}
6524; X64-NEXT:    vzeroupper
6525; X64-NEXT:    retq
6526entry:
6527  %0 = bitcast <4 x i64> %__A to <8 x i32>
6528  %1 = bitcast i8* %__P to i32*
6529  %2 = bitcast i8 %__U to <8 x i1>
6530  tail call void @llvm.masked.compressstore.v8i32(<8 x i32> %0, i32* %1, <8 x i1> %2) #10
6531  ret void
6532}
6533
6534
6535declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) #8
6536declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #8
6537declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #8
6538declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>) #8
6539
6540define <2 x double> @test_mm_mask_sqrt_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A) {
6541; X86-LABEL: test_mm_mask_sqrt_pd:
6542; X86:       # %bb.0: # %entry
6543; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
6544; X86-NEXT:    kmovw %eax, %k1
6545; X86-NEXT:    vsqrtpd %xmm1, %xmm0 {%k1}
6546; X86-NEXT:    retl
6547;
6548; X64-LABEL: test_mm_mask_sqrt_pd:
6549; X64:       # %bb.0: # %entry
6550; X64-NEXT:    kmovw %edi, %k1
6551; X64-NEXT:    vsqrtpd %xmm1, %xmm0 {%k1}
6552; X64-NEXT:    retq
6553entry:
6554  %0 = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %__A) #2
6555  %1 = bitcast i8 %__U to <8 x i1>
6556  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
6557  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__W
6558  ret <2 x double> %2
6559}
6560
6561declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
6562
6563define <2 x double> @test_mm_maskz_sqrt_pd(i8 zeroext %__U, <2 x double> %__A) {
6564; X86-LABEL: test_mm_maskz_sqrt_pd:
6565; X86:       # %bb.0: # %entry
6566; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
6567; X86-NEXT:    kmovw %eax, %k1
6568; X86-NEXT:    vsqrtpd %xmm0, %xmm0 {%k1} {z}
6569; X86-NEXT:    retl
6570;
6571; X64-LABEL: test_mm_maskz_sqrt_pd:
6572; X64:       # %bb.0: # %entry
6573; X64-NEXT:    kmovw %edi, %k1
6574; X64-NEXT:    vsqrtpd %xmm0, %xmm0 {%k1} {z}
6575; X64-NEXT:    retq
6576entry:
6577  %0 = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %__A) #2
6578  %1 = bitcast i8 %__U to <8 x i1>
6579  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
6580  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
6581  ret <2 x double> %2
6582}
6583
6584define <4 x double> @test_mm256_mask_sqrt_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A) {
6585; X86-LABEL: test_mm256_mask_sqrt_pd:
6586; X86:       # %bb.0: # %entry
6587; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
6588; X86-NEXT:    kmovw %eax, %k1
6589; X86-NEXT:    vsqrtpd %ymm1, %ymm0 {%k1}
6590; X86-NEXT:    retl
6591;
6592; X64-LABEL: test_mm256_mask_sqrt_pd:
6593; X64:       # %bb.0: # %entry
6594; X64-NEXT:    kmovw %edi, %k1
6595; X64-NEXT:    vsqrtpd %ymm1, %ymm0 {%k1}
6596; X64-NEXT:    retq
6597entry:
6598  %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %__A) #2
6599  %1 = bitcast i8 %__U to <8 x i1>
6600  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6601  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__W
6602  ret <4 x double> %2
6603}
6604
6605declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)
6606
6607define <4 x double> @test_mm256_maskz_sqrt_pd(i8 zeroext %__U, <4 x double> %__A) {
6608; X86-LABEL: test_mm256_maskz_sqrt_pd:
6609; X86:       # %bb.0: # %entry
6610; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
6611; X86-NEXT:    kmovw %eax, %k1
6612; X86-NEXT:    vsqrtpd %ymm0, %ymm0 {%k1} {z}
6613; X86-NEXT:    retl
6614;
6615; X64-LABEL: test_mm256_maskz_sqrt_pd:
6616; X64:       # %bb.0: # %entry
6617; X64-NEXT:    kmovw %edi, %k1
6618; X64-NEXT:    vsqrtpd %ymm0, %ymm0 {%k1} {z}
6619; X64-NEXT:    retq
6620entry:
6621  %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %__A) #2
6622  %1 = bitcast i8 %__U to <8 x i1>
6623  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6624  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
6625  ret <4 x double> %2
6626}
6627
6628define <4 x float> @test_mm_mask_sqrt_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A) {
6629; X86-LABEL: test_mm_mask_sqrt_ps:
6630; X86:       # %bb.0: # %entry
6631; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
6632; X86-NEXT:    kmovw %eax, %k1
6633; X86-NEXT:    vsqrtps %xmm1, %xmm0 {%k1}
6634; X86-NEXT:    retl
6635;
6636; X64-LABEL: test_mm_mask_sqrt_ps:
6637; X64:       # %bb.0: # %entry
6638; X64-NEXT:    kmovw %edi, %k1
6639; X64-NEXT:    vsqrtps %xmm1, %xmm0 {%k1}
6640; X64-NEXT:    retq
6641entry:
6642  %0 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %__A) #2
6643  %1 = bitcast i8 %__U to <8 x i1>
6644  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6645  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__W
6646  ret <4 x float> %2
6647}
6648
6649declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
6650
6651define <4 x float> @test_mm_maskz_sqrt_ps(i8 zeroext %__U, <4 x float> %__A) {
6652; X86-LABEL: test_mm_maskz_sqrt_ps:
6653; X86:       # %bb.0: # %entry
6654; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
6655; X86-NEXT:    kmovw %eax, %k1
6656; X86-NEXT:    vsqrtps %xmm0, %xmm0 {%k1} {z}
6657; X86-NEXT:    retl
6658;
6659; X64-LABEL: test_mm_maskz_sqrt_ps:
6660; X64:       # %bb.0: # %entry
6661; X64-NEXT:    kmovw %edi, %k1
6662; X64-NEXT:    vsqrtps %xmm0, %xmm0 {%k1} {z}
6663; X64-NEXT:    retq
6664entry:
6665  %0 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %__A) #2
6666  %1 = bitcast i8 %__U to <8 x i1>
6667  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6668  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
6669  ret <4 x float> %2
6670}
6671
6672define <8 x float> @test_mm256_mask_sqrt_ps(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A) {
6673; X86-LABEL: test_mm256_mask_sqrt_ps:
6674; X86:       # %bb.0: # %entry
6675; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
6676; X86-NEXT:    kmovw %eax, %k1
6677; X86-NEXT:    vsqrtps %ymm1, %ymm0 {%k1}
6678; X86-NEXT:    retl
6679;
6680; X64-LABEL: test_mm256_mask_sqrt_ps:
6681; X64:       # %bb.0: # %entry
6682; X64-NEXT:    kmovw %edi, %k1
6683; X64-NEXT:    vsqrtps %ymm1, %ymm0 {%k1}
6684; X64-NEXT:    retq
6685entry:
6686  %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %__A) #2
6687  %1 = bitcast i8 %__U to <8 x i1>
6688  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__W
6689  ret <8 x float> %2
6690}
6691
6692define <8 x float> @test_mm256_maskz_sqrt_ps(i8 zeroext %__U, <8 x float> %__A) {
6693; X86-LABEL: test_mm256_maskz_sqrt_ps:
6694; X86:       # %bb.0: # %entry
6695; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
6696; X86-NEXT:    kmovw %eax, %k1
6697; X86-NEXT:    vsqrtps %ymm0, %ymm0 {%k1} {z}
6698; X86-NEXT:    retl
6699;
6700; X64-LABEL: test_mm256_maskz_sqrt_ps:
6701; X64:       # %bb.0: # %entry
6702; X64-NEXT:    kmovw %edi, %k1
6703; X64-NEXT:    vsqrtps %ymm0, %ymm0 {%k1} {z}
6704; X64-NEXT:    retq
6705entry:
6706  %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %__A) #2
6707  %1 = bitcast i8 %__U to <8 x i1>
6708  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
6709  ret <8 x float> %2
6710}
6711
6712declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
6713
6714define <2 x i64> @test_mm_rol_epi32(<2 x i64> %__A) {
6715; CHECK-LABEL: test_mm_rol_epi32:
6716; CHECK:       # %bb.0: # %entry
6717; CHECK-NEXT:    vprold $5, %xmm0, %xmm0
6718; CHECK-NEXT:    ret{{[l|q]}}
6719entry:
6720  %0 = bitcast <2 x i64> %__A to <4 x i32>
6721  %1 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
6722  %2 = bitcast <4 x i32> %1 to <2 x i64>
6723  ret <2 x i64> %2
6724}
6725
6726define <2 x i64> @test_mm_mask_rol_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) {
6727; X86-LABEL: test_mm_mask_rol_epi32:
6728; X86:       # %bb.0: # %entry
6729; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
6730; X86-NEXT:    kmovw %eax, %k1
6731; X86-NEXT:    vprold $5, %xmm1, %xmm0 {%k1}
6732; X86-NEXT:    retl
6733;
6734; X64-LABEL: test_mm_mask_rol_epi32:
6735; X64:       # %bb.0: # %entry
6736; X64-NEXT:    kmovw %edi, %k1
6737; X64-NEXT:    vprold $5, %xmm1, %xmm0 {%k1}
6738; X64-NEXT:    retq
6739entry:
6740  %0 = bitcast <2 x i64> %__A to <4 x i32>
6741  %1 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
6742  %2 = bitcast <2 x i64> %__W to <4 x i32>
6743  %3 = bitcast i8 %__U to <8 x i1>
6744  %extract = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6745  %4 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %2
6746  %5 = bitcast <4 x i32> %4 to <2 x i64>
6747  ret <2 x i64> %5
6748}
6749
6750define <2 x i64> @test_mm_maskz_rol_epi32(i8 zeroext %__U, <2 x i64> %__A) {
6751; X86-LABEL: test_mm_maskz_rol_epi32:
6752; X86:       # %bb.0: # %entry
6753; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
6754; X86-NEXT:    kmovw %eax, %k1
6755; X86-NEXT:    vprold $5, %xmm0, %xmm0 {%k1} {z}
6756; X86-NEXT:    retl
6757;
6758; X64-LABEL: test_mm_maskz_rol_epi32:
6759; X64:       # %bb.0: # %entry
6760; X64-NEXT:    kmovw %edi, %k1
6761; X64-NEXT:    vprold $5, %xmm0, %xmm0 {%k1} {z}
6762; X64-NEXT:    retq
6763entry:
6764  %0 = bitcast <2 x i64> %__A to <4 x i32>
6765  %1 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
6766  %2 = bitcast i8 %__U to <8 x i1>
6767  %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6768  %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer
6769  %4 = bitcast <4 x i32> %3 to <2 x i64>
6770  ret <2 x i64> %4
6771}
6772
6773define <4 x i64> @test_mm256_rol_epi32(<4 x i64> %__A) {
6774; CHECK-LABEL: test_mm256_rol_epi32:
6775; CHECK:       # %bb.0: # %entry
6776; CHECK-NEXT:    vprold $5, %ymm0, %ymm0
6777; CHECK-NEXT:    ret{{[l|q]}}
6778entry:
6779  %0 = bitcast <4 x i64> %__A to <8 x i32>
6780  %1 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
6781  %2 = bitcast <8 x i32> %1 to <4 x i64>
6782  ret <4 x i64> %2
6783}
6784
6785define <4 x i64> @test_mm256_mask_rol_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) {
6786; X86-LABEL: test_mm256_mask_rol_epi32:
6787; X86:       # %bb.0: # %entry
6788; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
6789; X86-NEXT:    kmovw %eax, %k1
6790; X86-NEXT:    vprold $5, %ymm1, %ymm0 {%k1}
6791; X86-NEXT:    retl
6792;
6793; X64-LABEL: test_mm256_mask_rol_epi32:
6794; X64:       # %bb.0: # %entry
6795; X64-NEXT:    kmovw %edi, %k1
6796; X64-NEXT:    vprold $5, %ymm1, %ymm0 {%k1}
6797; X64-NEXT:    retq
6798entry:
6799  %0 = bitcast <4 x i64> %__A to <8 x i32>
6800  %1 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
6801  %2 = bitcast <4 x i64> %__W to <8 x i32>
6802  %3 = bitcast i8 %__U to <8 x i1>
6803  %4 = select <8 x i1> %3, <8 x i32> %1, <8 x i32> %2
6804  %5 = bitcast <8 x i32> %4 to <4 x i64>
6805  ret <4 x i64> %5
6806}
6807
6808define <4 x i64> @test_mm256_maskz_rol_epi32(i8 zeroext %__U, <4 x i64> %__A) {
6809; X86-LABEL: test_mm256_maskz_rol_epi32:
6810; X86:       # %bb.0: # %entry
6811; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
6812; X86-NEXT:    kmovw %eax, %k1
6813; X86-NEXT:    vprold $5, %ymm0, %ymm0 {%k1} {z}
6814; X86-NEXT:    retl
6815;
6816; X64-LABEL: test_mm256_maskz_rol_epi32:
6817; X64:       # %bb.0: # %entry
6818; X64-NEXT:    kmovw %edi, %k1
6819; X64-NEXT:    vprold $5, %ymm0, %ymm0 {%k1} {z}
6820; X64-NEXT:    retq
6821entry:
6822  %0 = bitcast <4 x i64> %__A to <8 x i32>
6823  %1 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
6824  %2 = bitcast i8 %__U to <8 x i1>
6825  %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer
6826  %4 = bitcast <8 x i32> %3 to <4 x i64>
6827  ret <4 x i64> %4
6828}
6829
6830define <2 x i64> @test_mm_rol_epi64(<2 x i64> %__A) {
6831; CHECK-LABEL: test_mm_rol_epi64:
6832; CHECK:       # %bb.0: # %entry
6833; CHECK-NEXT:    vprolq $5, %xmm0, %xmm0
6834; CHECK-NEXT:    ret{{[l|q]}}
6835entry:
6836  %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> <i64 5, i64 5>)
6837  ret <2 x i64> %0
6838}
6839
6840define <2 x i64> @test_mm_mask_rol_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) {
6841; X86-LABEL: test_mm_mask_rol_epi64:
6842; X86:       # %bb.0: # %entry
6843; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
6844; X86-NEXT:    kmovw %eax, %k1
6845; X86-NEXT:    vprolq $5, %xmm1, %xmm0 {%k1}
6846; X86-NEXT:    retl
6847;
6848; X64-LABEL: test_mm_mask_rol_epi64:
6849; X64:       # %bb.0: # %entry
6850; X64-NEXT:    kmovw %edi, %k1
6851; X64-NEXT:    vprolq $5, %xmm1, %xmm0 {%k1}
6852; X64-NEXT:    retq
6853entry:
6854  %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> <i64 5, i64 5>)
6855  %1 = bitcast i8 %__U to <8 x i1>
6856  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
6857  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__W
6858  ret <2 x i64> %2
6859}
6860
6861define <2 x i64> @test_mm_maskz_rol_epi64(i8 zeroext %__U, <2 x i64> %__A) {
6862; X86-LABEL: test_mm_maskz_rol_epi64:
6863; X86:       # %bb.0: # %entry
6864; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
6865; X86-NEXT:    kmovw %eax, %k1
6866; X86-NEXT:    vprolq $5, %xmm0, %xmm0 {%k1} {z}
6867; X86-NEXT:    retl
6868;
6869; X64-LABEL: test_mm_maskz_rol_epi64:
6870; X64:       # %bb.0: # %entry
6871; X64-NEXT:    kmovw %edi, %k1
6872; X64-NEXT:    vprolq $5, %xmm0, %xmm0 {%k1} {z}
6873; X64-NEXT:    retq
6874entry:
6875  %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> <i64 5, i64 5>)
6876  %1 = bitcast i8 %__U to <8 x i1>
6877  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
6878  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer
6879  ret <2 x i64> %2
6880}
6881
6882define <4 x i64> @test_mm256_rol_epi64(<4 x i64> %__A) {
6883; CHECK-LABEL: test_mm256_rol_epi64:
6884; CHECK:       # %bb.0: # %entry
6885; CHECK-NEXT:    vprolq $5, %ymm0, %ymm0
6886; CHECK-NEXT:    ret{{[l|q]}}
6887entry:
6888  %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> <i64 5, i64 5,i64 5, i64 5>)
6889  ret <4 x i64> %0
6890}
6891
6892define <4 x i64> @test_mm256_mask_rol_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) {
6893; X86-LABEL: test_mm256_mask_rol_epi64:
6894; X86:       # %bb.0: # %entry
6895; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
6896; X86-NEXT:    kmovw %eax, %k1
6897; X86-NEXT:    vprolq $5, %ymm1, %ymm0 {%k1}
6898; X86-NEXT:    retl
6899;
6900; X64-LABEL: test_mm256_mask_rol_epi64:
6901; X64:       # %bb.0: # %entry
6902; X64-NEXT:    kmovw %edi, %k1
6903; X64-NEXT:    vprolq $5, %ymm1, %ymm0 {%k1}
6904; X64-NEXT:    retq
6905entry:
6906  %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> <i64 5, i64 5,i64 5, i64 5>)
6907  %1 = bitcast i8 %__U to <8 x i1>
6908  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6909  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__W
6910  ret <4 x i64> %2
6911}
6912
6913define <4 x i64> @test_mm256_maskz_rol_epi64(i8 zeroext %__U, <4 x i64> %__A) {
6914; X86-LABEL: test_mm256_maskz_rol_epi64:
6915; X86:       # %bb.0: # %entry
6916; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
6917; X86-NEXT:    kmovw %eax, %k1
6918; X86-NEXT:    vprolq $5, %ymm0, %ymm0 {%k1} {z}
6919; X86-NEXT:    retl
6920;
6921; X64-LABEL: test_mm256_maskz_rol_epi64:
6922; X64:       # %bb.0: # %entry
6923; X64-NEXT:    kmovw %edi, %k1
6924; X64-NEXT:    vprolq $5, %ymm0, %ymm0 {%k1} {z}
6925; X64-NEXT:    retq
6926entry:
6927  %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> <i64 5, i64 5,i64 5, i64 5>)
6928  %1 = bitcast i8 %__U to <8 x i1>
6929  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6930  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer
6931  ret <4 x i64> %2
6932}
6933
6934define <2 x i64> @test_mm_rolv_epi32(<2 x i64> %__A, <2 x i64> %__B) {
6935; CHECK-LABEL: test_mm_rolv_epi32:
6936; CHECK:       # %bb.0: # %entry
6937; CHECK-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
6938; CHECK-NEXT:    ret{{[l|q]}}
6939entry:
6940  %0 = bitcast <2 x i64> %__A to <4 x i32>
6941  %1 = bitcast <2 x i64> %__B to <4 x i32>
6942  %2 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> %1)
6943  %3 = bitcast <4 x i32> %2 to <2 x i64>
6944  ret <2 x i64> %3
6945}
6946
6947define <2 x i64> @test_mm_mask_rolv_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
6948; X86-LABEL: test_mm_mask_rolv_epi32:
6949; X86:       # %bb.0: # %entry
6950; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
6951; X86-NEXT:    kmovw %eax, %k1
6952; X86-NEXT:    vprolvd %xmm2, %xmm1, %xmm0 {%k1}
6953; X86-NEXT:    retl
6954;
6955; X64-LABEL: test_mm_mask_rolv_epi32:
6956; X64:       # %bb.0: # %entry
6957; X64-NEXT:    kmovw %edi, %k1
6958; X64-NEXT:    vprolvd %xmm2, %xmm1, %xmm0 {%k1}
6959; X64-NEXT:    retq
6960entry:
6961  %0 = bitcast <2 x i64> %__A to <4 x i32>
6962  %1 = bitcast <2 x i64> %__B to <4 x i32>
6963  %2 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> %1)
6964  %3 = bitcast <2 x i64> %__W to <4 x i32>
6965  %4 = bitcast i8 %__U to <8 x i1>
6966  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6967  %5 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> %3
6968  %6 = bitcast <4 x i32> %5 to <2 x i64>
6969  ret <2 x i64> %6
6970}
6971
6972define <2 x i64> @test_mm_maskz_rolv_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
6973; X86-LABEL: test_mm_maskz_rolv_epi32:
6974; X86:       # %bb.0: # %entry
6975; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
6976; X86-NEXT:    kmovw %eax, %k1
6977; X86-NEXT:    vprolvd %xmm1, %xmm0, %xmm0 {%k1} {z}
6978; X86-NEXT:    retl
6979;
6980; X64-LABEL: test_mm_maskz_rolv_epi32:
6981; X64:       # %bb.0: # %entry
6982; X64-NEXT:    kmovw %edi, %k1
6983; X64-NEXT:    vprolvd %xmm1, %xmm0, %xmm0 {%k1} {z}
6984; X64-NEXT:    retq
6985entry:
6986  %0 = bitcast <2 x i64> %__A to <4 x i32>
6987  %1 = bitcast <2 x i64> %__B to <4 x i32>
6988  %2 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> %1)
6989  %3 = bitcast i8 %__U to <8 x i1>
6990  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6991  %4 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> zeroinitializer
6992  %5 = bitcast <4 x i32> %4 to <2 x i64>
6993  ret <2 x i64> %5
6994}
6995
6996define <4 x i64> @test_mm256_rolv_epi32(<4 x i64> %__A, <4 x i64> %__B) {
6997; CHECK-LABEL: test_mm256_rolv_epi32:
6998; CHECK:       # %bb.0: # %entry
6999; CHECK-NEXT:    vprolvd %ymm1, %ymm0, %ymm0
7000; CHECK-NEXT:    ret{{[l|q]}}
7001entry:
7002  %0 = bitcast <4 x i64> %__A to <8 x i32>
7003  %1 = bitcast <4 x i64> %__B to <8 x i32>
7004  %2 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> %1)
7005  %3 = bitcast <8 x i32> %2 to <4 x i64>
7006  ret <4 x i64> %3
7007}
7008
7009define <4 x i64> @test_mm256_mask_rolv_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
7010; X86-LABEL: test_mm256_mask_rolv_epi32:
7011; X86:       # %bb.0: # %entry
7012; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
7013; X86-NEXT:    kmovw %eax, %k1
7014; X86-NEXT:    vprolvd %ymm2, %ymm1, %ymm0 {%k1}
7015; X86-NEXT:    retl
7016;
7017; X64-LABEL: test_mm256_mask_rolv_epi32:
7018; X64:       # %bb.0: # %entry
7019; X64-NEXT:    kmovw %edi, %k1
7020; X64-NEXT:    vprolvd %ymm2, %ymm1, %ymm0 {%k1}
7021; X64-NEXT:    retq
7022entry:
7023  %0 = bitcast <4 x i64> %__A to <8 x i32>
7024  %1 = bitcast <4 x i64> %__B to <8 x i32>
7025  %2 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> %1)
7026  %3 = bitcast <4 x i64> %__W to <8 x i32>
7027  %4 = bitcast i8 %__U to <8 x i1>
7028  %5 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> %3
7029  %6 = bitcast <8 x i32> %5 to <4 x i64>
7030  ret <4 x i64> %6
7031}
7032
7033define <4 x i64> @test_mm256_maskz_rolv_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
7034; X86-LABEL: test_mm256_maskz_rolv_epi32:
7035; X86:       # %bb.0: # %entry
7036; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
7037; X86-NEXT:    kmovw %eax, %k1
7038; X86-NEXT:    vprolvd %ymm1, %ymm0, %ymm0 {%k1} {z}
7039; X86-NEXT:    retl
7040;
7041; X64-LABEL: test_mm256_maskz_rolv_epi32:
7042; X64:       # %bb.0: # %entry
7043; X64-NEXT:    kmovw %edi, %k1
7044; X64-NEXT:    vprolvd %ymm1, %ymm0, %ymm0 {%k1} {z}
7045; X64-NEXT:    retq
7046entry:
7047  %0 = bitcast <4 x i64> %__A to <8 x i32>
7048  %1 = bitcast <4 x i64> %__B to <8 x i32>
7049  %2 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> %1)
7050  %3 = bitcast i8 %__U to <8 x i1>
7051  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
7052  %5 = bitcast <8 x i32> %4 to <4 x i64>
7053  ret <4 x i64> %5
7054}
7055
7056define <2 x i64> @test_mm_rolv_epi64(<2 x i64> %__A, <2 x i64> %__B) {
7057; CHECK-LABEL: test_mm_rolv_epi64:
7058; CHECK:       # %bb.0: # %entry
7059; CHECK-NEXT:    vprolvq %xmm1, %xmm0, %xmm0
7060; CHECK-NEXT:    ret{{[l|q]}}
7061entry:
7062  %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> %__B)
7063  ret <2 x i64> %0
7064}
7065
7066define <2 x i64> @test_mm_mask_rolv_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
7067; X86-LABEL: test_mm_mask_rolv_epi64:
7068; X86:       # %bb.0: # %entry
7069; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
7070; X86-NEXT:    kmovw %eax, %k1
7071; X86-NEXT:    vprolvq %xmm2, %xmm1, %xmm0 {%k1}
7072; X86-NEXT:    retl
7073;
7074; X64-LABEL: test_mm_mask_rolv_epi64:
7075; X64:       # %bb.0: # %entry
7076; X64-NEXT:    kmovw %edi, %k1
7077; X64-NEXT:    vprolvq %xmm2, %xmm1, %xmm0 {%k1}
7078; X64-NEXT:    retq
7079entry:
7080  %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> %__B)
7081  %1 = bitcast i8 %__U to <8 x i1>
7082  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
7083  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__W
7084  ret <2 x i64> %2
7085}
7086
7087define <2 x i64> @test_mm_maskz_rolv_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
7088; X86-LABEL: test_mm_maskz_rolv_epi64:
7089; X86:       # %bb.0: # %entry
7090; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
7091; X86-NEXT:    kmovw %eax, %k1
7092; X86-NEXT:    vprolvq %xmm1, %xmm0, %xmm0 {%k1} {z}
7093; X86-NEXT:    retl
7094;
7095; X64-LABEL: test_mm_maskz_rolv_epi64:
7096; X64:       # %bb.0: # %entry
7097; X64-NEXT:    kmovw %edi, %k1
7098; X64-NEXT:    vprolvq %xmm1, %xmm0, %xmm0 {%k1} {z}
7099; X64-NEXT:    retq
7100entry:
7101  %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> %__B)
7102  %1 = bitcast i8 %__U to <8 x i1>
7103  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
7104  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
7105  ret <2 x i64> %2
7106}
7107
7108define <4 x i64> @test_mm256_rolv_epi64(<4 x i64> %__A, <4 x i64> %__B) {
7109; CHECK-LABEL: test_mm256_rolv_epi64:
7110; CHECK:       # %bb.0: # %entry
7111; CHECK-NEXT:    vprolvq %ymm1, %ymm0, %ymm0
7112; CHECK-NEXT:    ret{{[l|q]}}
7113entry:
7114  %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> %__B)
7115  ret <4 x i64> %0
7116}
7117
7118define <4 x i64> @test_mm256_mask_rolv_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
7119; X86-LABEL: test_mm256_mask_rolv_epi64:
7120; X86:       # %bb.0: # %entry
7121; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
7122; X86-NEXT:    kmovw %eax, %k1
7123; X86-NEXT:    vprolvq %ymm2, %ymm1, %ymm0 {%k1}
7124; X86-NEXT:    retl
7125;
7126; X64-LABEL: test_mm256_mask_rolv_epi64:
7127; X64:       # %bb.0: # %entry
7128; X64-NEXT:    kmovw %edi, %k1
7129; X64-NEXT:    vprolvq %ymm2, %ymm1, %ymm0 {%k1}
7130; X64-NEXT:    retq
7131entry:
7132  %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> %__B)
7133  %1 = bitcast i8 %__U to <8 x i1>
7134  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7135  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__W
7136  ret <4 x i64> %2
7137}
7138
7139define <4 x i64> @test_mm256_maskz_rolv_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
7140; X86-LABEL: test_mm256_maskz_rolv_epi64:
7141; X86:       # %bb.0: # %entry
7142; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
7143; X86-NEXT:    kmovw %eax, %k1
7144; X86-NEXT:    vprolvq %ymm1, %ymm0, %ymm0 {%k1} {z}
7145; X86-NEXT:    retl
7146;
7147; X64-LABEL: test_mm256_maskz_rolv_epi64:
7148; X64:       # %bb.0: # %entry
7149; X64-NEXT:    kmovw %edi, %k1
7150; X64-NEXT:    vprolvq %ymm1, %ymm0, %ymm0 {%k1} {z}
7151; X64-NEXT:    retq
7152entry:
7153  %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> %__B)
7154  %1 = bitcast i8 %__U to <8 x i1>
7155  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7156  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
7157  ret <4 x i64> %2
7158}
7159
7160define <2 x i64> @test_mm_ror_epi32(<2 x i64> %__A) {
7161; CHECK-LABEL: test_mm_ror_epi32:
7162; CHECK:       # %bb.0: # %entry
7163; CHECK-NEXT:    vprord $5, %xmm0, %xmm0
7164; CHECK-NEXT:    ret{{[l|q]}}
7165entry:
7166  %0 = bitcast <2 x i64> %__A to <4 x i32>
7167  %1 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
7168  %2 = bitcast <4 x i32> %1 to <2 x i64>
7169  ret <2 x i64> %2
7170}
7171
7172define <2 x i64> @test_mm_mask_ror_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) {
7173; X86-LABEL: test_mm_mask_ror_epi32:
7174; X86:       # %bb.0: # %entry
7175; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
7176; X86-NEXT:    kmovw %eax, %k1
7177; X86-NEXT:    vprord $5, %xmm1, %xmm0 {%k1}
7178; X86-NEXT:    retl
7179;
7180; X64-LABEL: test_mm_mask_ror_epi32:
7181; X64:       # %bb.0: # %entry
7182; X64-NEXT:    kmovw %edi, %k1
7183; X64-NEXT:    vprord $5, %xmm1, %xmm0 {%k1}
7184; X64-NEXT:    retq
7185entry:
7186  %0 = bitcast <2 x i64> %__A to <4 x i32>
7187  %1 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
7188  %2 = bitcast <2 x i64> %__W to <4 x i32>
7189  %3 = bitcast i8 %__U to <8 x i1>
7190  %extract = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7191  %4 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %2
7192  %5 = bitcast <4 x i32> %4 to <2 x i64>
7193  ret <2 x i64> %5
7194}
7195
7196define <2 x i64> @test_mm_maskz_ror_epi32(i8 zeroext %__U, <2 x i64> %__A) {
7197; X86-LABEL: test_mm_maskz_ror_epi32:
7198; X86:       # %bb.0: # %entry
7199; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
7200; X86-NEXT:    kmovw %eax, %k1
7201; X86-NEXT:    vprord $5, %xmm0, %xmm0 {%k1} {z}
7202; X86-NEXT:    retl
7203;
7204; X64-LABEL: test_mm_maskz_ror_epi32:
7205; X64:       # %bb.0: # %entry
7206; X64-NEXT:    kmovw %edi, %k1
7207; X64-NEXT:    vprord $5, %xmm0, %xmm0 {%k1} {z}
7208; X64-NEXT:    retq
7209entry:
7210  %0 = bitcast <2 x i64> %__A to <4 x i32>
7211  %1 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
7212  %2 = bitcast i8 %__U to <8 x i1>
7213  %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7214  %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer
7215  %4 = bitcast <4 x i32> %3 to <2 x i64>
7216  ret <2 x i64> %4
7217}
7218
7219define <4 x i64> @test_mm256_ror_epi32(<4 x i64> %__A) {
7220; CHECK-LABEL: test_mm256_ror_epi32:
7221; CHECK:       # %bb.0: # %entry
7222; CHECK-NEXT:    vprord $5, %ymm0, %ymm0
7223; CHECK-NEXT:    ret{{[l|q]}}
7224entry:
7225  %0 = bitcast <4 x i64> %__A to <8 x i32>
7226  %1 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
7227  %2 = bitcast <8 x i32> %1 to <4 x i64>
7228  ret <4 x i64> %2
7229}
7230
7231define <4 x i64> @test_mm256_mask_ror_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) {
7232; X86-LABEL: test_mm256_mask_ror_epi32:
7233; X86:       # %bb.0: # %entry
7234; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
7235; X86-NEXT:    kmovw %eax, %k1
7236; X86-NEXT:    vprord $5, %ymm1, %ymm0 {%k1}
7237; X86-NEXT:    retl
7238;
7239; X64-LABEL: test_mm256_mask_ror_epi32:
7240; X64:       # %bb.0: # %entry
7241; X64-NEXT:    kmovw %edi, %k1
7242; X64-NEXT:    vprord $5, %ymm1, %ymm0 {%k1}
7243; X64-NEXT:    retq
7244entry:
7245  %0 = bitcast <4 x i64> %__A to <8 x i32>
7246  %1 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
7247  %2 = bitcast <4 x i64> %__W to <8 x i32>
7248  %3 = bitcast i8 %__U to <8 x i1>
7249  %4 = select <8 x i1> %3, <8 x i32> %1, <8 x i32> %2
7250  %5 = bitcast <8 x i32> %4 to <4 x i64>
7251  ret <4 x i64> %5
7252}
7253
7254define <4 x i64> @test_mm256_maskz_ror_epi32(i8 zeroext %__U, <4 x i64> %__A) {
7255; X86-LABEL: test_mm256_maskz_ror_epi32:
7256; X86:       # %bb.0: # %entry
7257; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
7258; X86-NEXT:    kmovw %eax, %k1
7259; X86-NEXT:    vprord $5, %ymm0, %ymm0 {%k1} {z}
7260; X86-NEXT:    retl
7261;
7262; X64-LABEL: test_mm256_maskz_ror_epi32:
7263; X64:       # %bb.0: # %entry
7264; X64-NEXT:    kmovw %edi, %k1
7265; X64-NEXT:    vprord $5, %ymm0, %ymm0 {%k1} {z}
7266; X64-NEXT:    retq
7267entry:
7268  %0 = bitcast <4 x i64> %__A to <8 x i32>
7269  %1 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
7270  %2 = bitcast i8 %__U to <8 x i1>
7271  %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer
7272  %4 = bitcast <8 x i32> %3 to <4 x i64>
7273  ret <4 x i64> %4
7274}
7275
7276define <2 x i64> @test_mm_ror_epi64(<2 x i64> %__A) {
7277; CHECK-LABEL: test_mm_ror_epi64:
7278; CHECK:       # %bb.0: # %entry
7279; CHECK-NEXT:    vprorq $5, %xmm0, %xmm0
7280; CHECK-NEXT:    ret{{[l|q]}}
7281entry:
7282  %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> <i64 5, i64 5>)
7283  ret <2 x i64> %0
7284}
7285
7286define <2 x i64> @test_mm_mask_ror_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) {
7287; X86-LABEL: test_mm_mask_ror_epi64:
7288; X86:       # %bb.0: # %entry
7289; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
7290; X86-NEXT:    kmovw %eax, %k1
7291; X86-NEXT:    vprorq $5, %xmm1, %xmm0 {%k1}
7292; X86-NEXT:    retl
7293;
7294; X64-LABEL: test_mm_mask_ror_epi64:
7295; X64:       # %bb.0: # %entry
7296; X64-NEXT:    kmovw %edi, %k1
7297; X64-NEXT:    vprorq $5, %xmm1, %xmm0 {%k1}
7298; X64-NEXT:    retq
7299entry:
7300  %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> <i64 5, i64 5>)
7301  %1 = bitcast i8 %__U to <8 x i1>
7302  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
7303  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__W
7304  ret <2 x i64> %2
7305}
7306
7307define <2 x i64> @test_mm_maskz_ror_epi64(i8 zeroext %__U, <2 x i64> %__A) {
7308; X86-LABEL: test_mm_maskz_ror_epi64:
7309; X86:       # %bb.0: # %entry
7310; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
7311; X86-NEXT:    kmovw %eax, %k1
7312; X86-NEXT:    vprorq $5, %xmm0, %xmm0 {%k1} {z}
7313; X86-NEXT:    retl
7314;
7315; X64-LABEL: test_mm_maskz_ror_epi64:
7316; X64:       # %bb.0: # %entry
7317; X64-NEXT:    kmovw %edi, %k1
7318; X64-NEXT:    vprorq $5, %xmm0, %xmm0 {%k1} {z}
7319; X64-NEXT:    retq
7320entry:
7321  %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> <i64 5, i64 5>)
7322  %1 = bitcast i8 %__U to <8 x i1>
7323  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
7324  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer
7325  ret <2 x i64> %2
7326}
7327
7328define <4 x i64> @test_mm256_ror_epi64(<4 x i64> %__A) {
7329; CHECK-LABEL: test_mm256_ror_epi64:
7330; CHECK:       # %bb.0: # %entry
7331; CHECK-NEXT:    vprorq $5, %ymm0, %ymm0
7332; CHECK-NEXT:    ret{{[l|q]}}
7333entry:
7334  %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> <i64 5, i64 5, i64 5, i64 5>)
7335  ret <4 x i64> %0
7336}
7337
7338define <4 x i64> @test_mm256_mask_ror_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) {
7339; X86-LABEL: test_mm256_mask_ror_epi64:
7340; X86:       # %bb.0: # %entry
7341; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
7342; X86-NEXT:    kmovw %eax, %k1
7343; X86-NEXT:    vprorq $5, %ymm1, %ymm0 {%k1}
7344; X86-NEXT:    retl
7345;
7346; X64-LABEL: test_mm256_mask_ror_epi64:
7347; X64:       # %bb.0: # %entry
7348; X64-NEXT:    kmovw %edi, %k1
7349; X64-NEXT:    vprorq $5, %ymm1, %ymm0 {%k1}
7350; X64-NEXT:    retq
7351entry:
7352  %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> <i64 5, i64 5, i64 5, i64 5>)
7353  %1 = bitcast i8 %__U to <8 x i1>
7354  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7355  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__W
7356  ret <4 x i64> %2
7357}
7358
7359define <4 x i64> @test_mm256_maskz_ror_epi64(i8 zeroext %__U, <4 x i64> %__A) {
7360; X86-LABEL: test_mm256_maskz_ror_epi64:
7361; X86:       # %bb.0: # %entry
7362; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
7363; X86-NEXT:    kmovw %eax, %k1
7364; X86-NEXT:    vprorq $5, %ymm0, %ymm0 {%k1} {z}
7365; X86-NEXT:    retl
7366;
7367; X64-LABEL: test_mm256_maskz_ror_epi64:
7368; X64:       # %bb.0: # %entry
7369; X64-NEXT:    kmovw %edi, %k1
7370; X64-NEXT:    vprorq $5, %ymm0, %ymm0 {%k1} {z}
7371; X64-NEXT:    retq
7372entry:
7373  %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> <i64 5, i64 5, i64 5, i64 5>)
7374  %1 = bitcast i8 %__U to <8 x i1>
7375  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7376  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer
7377  ret <4 x i64> %2
7378}
7379
7380define <2 x i64> @test_mm_rorv_epi32(<2 x i64> %__A, <2 x i64> %__B) {
7381; CHECK-LABEL: test_mm_rorv_epi32:
7382; CHECK:       # %bb.0: # %entry
7383; CHECK-NEXT:    vprorvd %xmm1, %xmm0, %xmm0
7384; CHECK-NEXT:    ret{{[l|q]}}
7385entry:
7386  %0 = bitcast <2 x i64> %__A to <4 x i32>
7387  %1 = bitcast <2 x i64> %__B to <4 x i32>
7388  %2 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> %1)
7389  %3 = bitcast <4 x i32> %2 to <2 x i64>
7390  ret <2 x i64> %3
7391}
7392
7393define <2 x i64> @test_mm_mask_rorv_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
7394; X86-LABEL: test_mm_mask_rorv_epi32:
7395; X86:       # %bb.0: # %entry
7396; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
7397; X86-NEXT:    kmovw %eax, %k1
7398; X86-NEXT:    vprorvd %xmm2, %xmm1, %xmm0 {%k1}
7399; X86-NEXT:    retl
7400;
7401; X64-LABEL: test_mm_mask_rorv_epi32:
7402; X64:       # %bb.0: # %entry
7403; X64-NEXT:    kmovw %edi, %k1
7404; X64-NEXT:    vprorvd %xmm2, %xmm1, %xmm0 {%k1}
7405; X64-NEXT:    retq
7406entry:
7407  %0 = bitcast <2 x i64> %__A to <4 x i32>
7408  %1 = bitcast <2 x i64> %__B to <4 x i32>
7409  %2 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> %1)
7410  %3 = bitcast <2 x i64> %__W to <4 x i32>
7411  %4 = bitcast i8 %__U to <8 x i1>
7412  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7413  %5 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> %3
7414  %6 = bitcast <4 x i32> %5 to <2 x i64>
7415  ret <2 x i64> %6
7416}
7417
7418define <2 x i64> @test_mm_maskz_rorv_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
7419; X86-LABEL: test_mm_maskz_rorv_epi32:
7420; X86:       # %bb.0: # %entry
7421; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
7422; X86-NEXT:    kmovw %eax, %k1
7423; X86-NEXT:    vprorvd %xmm1, %xmm0, %xmm0 {%k1} {z}
7424; X86-NEXT:    retl
7425;
7426; X64-LABEL: test_mm_maskz_rorv_epi32:
7427; X64:       # %bb.0: # %entry
7428; X64-NEXT:    kmovw %edi, %k1
7429; X64-NEXT:    vprorvd %xmm1, %xmm0, %xmm0 {%k1} {z}
7430; X64-NEXT:    retq
7431entry:
7432  %0 = bitcast <2 x i64> %__A to <4 x i32>
7433  %1 = bitcast <2 x i64> %__B to <4 x i32>
7434  %2 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> %1)
7435  %3 = bitcast i8 %__U to <8 x i1>
7436  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7437  %4 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> zeroinitializer
7438  %5 = bitcast <4 x i32> %4 to <2 x i64>
7439  ret <2 x i64> %5
7440}
7441
7442define <4 x i64> @test_mm256_rorv_epi32(<4 x i64> %__A, <4 x i64> %__B) {
7443; CHECK-LABEL: test_mm256_rorv_epi32:
7444; CHECK:       # %bb.0: # %entry
7445; CHECK-NEXT:    vprorvd %ymm1, %ymm0, %ymm0
7446; CHECK-NEXT:    ret{{[l|q]}}
7447entry:
7448  %0 = bitcast <4 x i64> %__A to <8 x i32>
7449  %1 = bitcast <4 x i64> %__B to <8 x i32>
7450  %2 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> %1)
7451  %3 = bitcast <8 x i32> %2 to <4 x i64>
7452  ret <4 x i64> %3
7453}
7454
7455define <4 x i64> @test_mm256_mask_rorv_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
7456; X86-LABEL: test_mm256_mask_rorv_epi32:
7457; X86:       # %bb.0: # %entry
7458; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
7459; X86-NEXT:    kmovw %eax, %k1
7460; X86-NEXT:    vprorvd %ymm2, %ymm1, %ymm0 {%k1}
7461; X86-NEXT:    retl
7462;
7463; X64-LABEL: test_mm256_mask_rorv_epi32:
7464; X64:       # %bb.0: # %entry
7465; X64-NEXT:    kmovw %edi, %k1
7466; X64-NEXT:    vprorvd %ymm2, %ymm1, %ymm0 {%k1}
7467; X64-NEXT:    retq
7468entry:
7469  %0 = bitcast <4 x i64> %__A to <8 x i32>
7470  %1 = bitcast <4 x i64> %__B to <8 x i32>
7471  %2 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> %1)
7472  %3 = bitcast <4 x i64> %__W to <8 x i32>
7473  %4 = bitcast i8 %__U to <8 x i1>
7474  %5 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> %3
7475  %6 = bitcast <8 x i32> %5 to <4 x i64>
7476  ret <4 x i64> %6
7477}
7478
7479define <4 x i64> @test_mm256_maskz_rorv_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
7480; X86-LABEL: test_mm256_maskz_rorv_epi32:
7481; X86:       # %bb.0: # %entry
7482; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
7483; X86-NEXT:    kmovw %eax, %k1
7484; X86-NEXT:    vprorvd %ymm1, %ymm0, %ymm0 {%k1} {z}
7485; X86-NEXT:    retl
7486;
7487; X64-LABEL: test_mm256_maskz_rorv_epi32:
7488; X64:       # %bb.0: # %entry
7489; X64-NEXT:    kmovw %edi, %k1
7490; X64-NEXT:    vprorvd %ymm1, %ymm0, %ymm0 {%k1} {z}
7491; X64-NEXT:    retq
7492entry:
7493  %0 = bitcast <4 x i64> %__A to <8 x i32>
7494  %1 = bitcast <4 x i64> %__B to <8 x i32>
7495  %2 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> %1)
7496  %3 = bitcast i8 %__U to <8 x i1>
7497  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
7498  %5 = bitcast <8 x i32> %4 to <4 x i64>
7499  ret <4 x i64> %5
7500}
7501
7502define <2 x i64> @test_mm_rorv_epi64(<2 x i64> %__A, <2 x i64> %__B) {
7503; CHECK-LABEL: test_mm_rorv_epi64:
7504; CHECK:       # %bb.0: # %entry
7505; CHECK-NEXT:    vprorvq %xmm1, %xmm0, %xmm0
7506; CHECK-NEXT:    ret{{[l|q]}}
7507entry:
7508  %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> %__B)
7509  ret <2 x i64> %0
7510}
7511
7512define <2 x i64> @test_mm_mask_rorv_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
7513; X86-LABEL: test_mm_mask_rorv_epi64:
7514; X86:       # %bb.0: # %entry
7515; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
7516; X86-NEXT:    kmovw %eax, %k1
7517; X86-NEXT:    vprorvq %xmm2, %xmm1, %xmm0 {%k1}
7518; X86-NEXT:    retl
7519;
7520; X64-LABEL: test_mm_mask_rorv_epi64:
7521; X64:       # %bb.0: # %entry
7522; X64-NEXT:    kmovw %edi, %k1
7523; X64-NEXT:    vprorvq %xmm2, %xmm1, %xmm0 {%k1}
7524; X64-NEXT:    retq
7525entry:
7526  %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> %__B)
7527  %1 = bitcast i8 %__U to <8 x i1>
7528  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
7529  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__W
7530  ret <2 x i64> %2
7531}
7532
7533define <2 x i64> @test_mm_maskz_rorv_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
7534; X86-LABEL: test_mm_maskz_rorv_epi64:
7535; X86:       # %bb.0: # %entry
7536; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
7537; X86-NEXT:    kmovw %eax, %k1
7538; X86-NEXT:    vprorvq %xmm1, %xmm0, %xmm0 {%k1} {z}
7539; X86-NEXT:    retl
7540;
7541; X64-LABEL: test_mm_maskz_rorv_epi64:
7542; X64:       # %bb.0: # %entry
7543; X64-NEXT:    kmovw %edi, %k1
7544; X64-NEXT:    vprorvq %xmm1, %xmm0, %xmm0 {%k1} {z}
7545; X64-NEXT:    retq
7546entry:
7547  %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> %__B)
7548  %1 = bitcast i8 %__U to <8 x i1>
7549  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
7550  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
7551  ret <2 x i64> %2
7552}
7553
7554define <4 x i64> @test_mm256_rorv_epi64(<4 x i64> %__A, <4 x i64> %__B) {
7555; CHECK-LABEL: test_mm256_rorv_epi64:
7556; CHECK:       # %bb.0: # %entry
7557; CHECK-NEXT:    vprorvq %ymm1, %ymm0, %ymm0
7558; CHECK-NEXT:    ret{{[l|q]}}
7559entry:
7560  %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> %__B)
7561  ret <4 x i64> %0
7562}
7563
7564define <4 x i64> @test_mm256_mask_rorv_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
7565; X86-LABEL: test_mm256_mask_rorv_epi64:
7566; X86:       # %bb.0: # %entry
7567; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
7568; X86-NEXT:    kmovw %eax, %k1
7569; X86-NEXT:    vprorvq %ymm2, %ymm1, %ymm0 {%k1}
7570; X86-NEXT:    retl
7571;
7572; X64-LABEL: test_mm256_mask_rorv_epi64:
7573; X64:       # %bb.0: # %entry
7574; X64-NEXT:    kmovw %edi, %k1
7575; X64-NEXT:    vprorvq %ymm2, %ymm1, %ymm0 {%k1}
7576; X64-NEXT:    retq
7577entry:
7578  %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> %__B)
7579  %1 = bitcast i8 %__U to <8 x i1>
7580  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7581  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__W
7582  ret <4 x i64> %2
7583}
7584
7585define <4 x i64> @test_mm256_maskz_rorv_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
7586; X86-LABEL: test_mm256_maskz_rorv_epi64:
7587; X86:       # %bb.0: # %entry
7588; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
7589; X86-NEXT:    kmovw %eax, %k1
7590; X86-NEXT:    vprorvq %ymm1, %ymm0, %ymm0 {%k1} {z}
7591; X86-NEXT:    retl
7592;
7593; X64-LABEL: test_mm256_maskz_rorv_epi64:
7594; X64:       # %bb.0: # %entry
7595; X64-NEXT:    kmovw %edi, %k1
7596; X64-NEXT:    vprorvq %ymm1, %ymm0, %ymm0 {%k1} {z}
7597; X64-NEXT:    retq
7598entry:
7599  %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> %__B)
7600  %1 = bitcast i8 %__U to <8 x i1>
7601  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7602  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
7603  ret <4 x i64> %2
7604}
7605
7606declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>)
7607declare <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32>)
7608declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double>, <4 x i32>, i8)
7609declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>)
7610declare <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double>, <4 x float>, i8)
7611declare <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double>)
7612declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double>, <4 x i32>, i8)
7613declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double>, <4 x i32>, i8)
7614declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>)
7615declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>)
7616declare <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float>, <4 x i32>, i8)
7617declare <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float>, <8 x i32>, i8)
7618declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double>, <4 x i32>, i8)
7619declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>)
7620declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double>, <4 x i32>, i8)
7621declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double>, <4 x i32>, i8)
7622declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>)
7623declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>)
7624declare <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float>, <4 x i32>, i8)
7625declare <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float>, <8 x i32>, i8)
7626declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32>, <8 x i16>, i8)
7627declare <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>)
7628declare <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>)
7629declare <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>)
7630declare <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>)
7631declare <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float>, <4 x i32>, <4 x float>)
7632declare <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>)
7633declare <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>)
7634declare <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>)
7635declare <2 x double> @llvm.masked.expandload.v2f64(double*, <2 x i1>, <2 x double>)
7636declare <4 x double> @llvm.masked.expandload.v4f64(double*, <4 x i1>, <4 x double>)
7637declare <2 x i64> @llvm.masked.expandload.v2i64(i64*, <2 x i1>, <2 x i64>)
7638declare <4 x i64> @llvm.masked.expandload.v4i64(i64*, <4 x i1>, <4 x i64>)
7639declare <4 x float> @llvm.masked.expandload.v4f32(float*, <4 x i1>, <4 x float>)
7640declare <8 x float> @llvm.masked.expandload.v8f32(float*, <8 x i1>, <8 x float>)
7641declare <4 x i32> @llvm.masked.expandload.v4i32(i32*, <4 x i1>, <4 x i32>)
7642declare <8 x i32> @llvm.masked.expandload.v8i32(i32*, <8 x i1>, <8 x i32>)
7643declare void @llvm.masked.compressstore.v2f64(<2 x double>, double*, <2 x i1>)
7644declare void @llvm.masked.compressstore.v4f64(<4 x double>, double*, <4 x i1>)
7645declare void @llvm.masked.compressstore.v2i64(<2 x i64>, i64*, <2 x i1>)
7646declare void @llvm.masked.compressstore.v4i64(<4 x i64>, i64*, <4 x i1>)
7647declare void @llvm.masked.compressstore.v4f32(<4 x float>, float*, <4 x i1>)
7648declare void @llvm.masked.compressstore.v8f32(<8 x float>, float*, <8 x i1>)
7649declare void @llvm.masked.compressstore.v4i32(<4 x i32>, i32*, <4 x i1>)
7650declare void @llvm.masked.compressstore.v8i32(<8 x i32>, i32*, <8 x i1>)
7651declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
7652declare <8 x i32> @llvm.fshl.v8i32(<8 x i32>, <8 x i32>, <8 x i32>)
7653declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
7654declare <4 x i64> @llvm.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
7655declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
7656declare <8 x i32> @llvm.fshr.v8i32(<8 x i32>, <8 x i32>, <8 x i32>)
7657declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
7658declare <4 x i64> @llvm.fshr.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
7659
7660!0 = !{i32 1}
7661