; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512f-builtins.c
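;
; For orientation, a minimal C sketch of the kind of intrinsics code the first
; test below corresponds to (an illustrative assumption for readers, not the
; authoritative source; the checked IR is meant to match what clang emits for
; the builtins test referenced above):
;
;   #include <immintrin.h>
;
;   __mmask16 test_mm512_kunpackb(__m512i A, __m512i B, __m512i C,
;                                 __m512i D, __m512i E, __m512i F) {
;     /* Concatenate the low 8 bits of two 16-bit compare masks (per the
;        documented _mm512_kunpackb operation, the second operand supplies
;        bits 7:0 and the first supplies bits 15:8), then use the combined
;        mask as the write mask for a third compare. */
;     __mmask16 k = _mm512_kunpackb(_mm512_cmpneq_epi32_mask(C, D),
;                                   _mm512_cmpneq_epi32_mask(A, B));
;     return _mm512_mask_cmpneq_epi32_mask(k, E, F);
;   }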


define zeroext i16 @test_mm512_kunpackb(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_kunpackb:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-64, %esp
; X86-NEXT:    subl $64, %esp
; X86-NEXT:    vmovdqa64 136(%ebp), %zmm3
; X86-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
; X86-NEXT:    vpcmpneqd 8(%ebp), %zmm2, %k1
; X86-NEXT:    kunpckbw %k0, %k1, %k1
; X86-NEXT:    vpcmpneqd 72(%ebp), %zmm3, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    movzwl %ax, %eax
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_kunpackb:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
; X64-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
; X64-NEXT:    kunpckbw %k0, %k1, %k1
; X64-NEXT:    vpcmpneqd %zmm5, %zmm4, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzwl %ax, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__E to <16 x i32>
  %1 = bitcast <8 x i64> %__F to <16 x i32>
  %2 = bitcast <8 x i64> %__A to <16 x i32>
  %3 = bitcast <8 x i64> %__B to <16 x i32>
  %4 = icmp ne <16 x i32> %2, %3
  %5 = bitcast <8 x i64> %__C to <16 x i32>
  %6 = bitcast <8 x i64> %__D to <16 x i32>
  %7 = icmp ne <16 x i32> %5, %6
  %8 = shufflevector <16 x i1> %4, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %9 = shufflevector <16 x i1> %7, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %10 = shufflevector <8 x i1> %8, <8 x i1> %9, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %11 = icmp ne <16 x i32> %0, %1
  %12 = and <16 x i1> %11, %10
  %13 = bitcast <16 x i1> %12 to i16
  ret i16 %13
}

define i32 @test_mm512_kortestc(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D) {
; X86-LABEL: test_mm512_kortestc:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-64, %esp
; X86-NEXT:    subl $64, %esp
; X86-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
; X86-NEXT:    vpcmpneqd 8(%ebp), %zmm2, %k1
; X86-NEXT:    korw %k0, %k1, %k0
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    cmpw $-1, %ax
; X86-NEXT:    sete %al
; X86-NEXT:    andb $1, %al
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_kortestc:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
; X64-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
; X64-NEXT:    korw %k0, %k1, %k0
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    cmpw $-1, %ax
; X64-NEXT:    sete %al
; X64-NEXT:    andb $1, %al
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = icmp ne <16 x i32> %0, %1
  %3 = bitcast <8 x i64> %__C to <16 x i32>
  %4 = bitcast <8 x i64> %__D to <16 x i32>
  %5 = icmp ne <16 x i32> %3, %4
  %6 = or <16 x i1> %5, %2
  %7 = bitcast <16 x i1> %6 to i16
103  %8 = icmp eq i16 %7, -1
104  %9 = zext i1 %8 to i32
105  ret i32 %9
106}
107
108define i32 @test_mm512_kortestz(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D) {
109; X86-LABEL: test_mm512_kortestz:
110; X86:       # %bb.0: # %entry
111; X86-NEXT:    pushl %ebp
112; X86-NEXT:    .cfi_def_cfa_offset 8
113; X86-NEXT:    .cfi_offset %ebp, -8
114; X86-NEXT:    movl %esp, %ebp
115; X86-NEXT:    .cfi_def_cfa_register %ebp
116; X86-NEXT:    andl $-64, %esp
117; X86-NEXT:    subl $64, %esp
118; X86-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
119; X86-NEXT:    vpcmpneqd 8(%ebp), %zmm2, %k1
120; X86-NEXT:    korw %k0, %k1, %k0
121; X86-NEXT:    kmovw %k0, %eax
122; X86-NEXT:    cmpw $0, %ax
123; X86-NEXT:    sete %al
124; X86-NEXT:    andb $1, %al
125; X86-NEXT:    movzbl %al, %eax
126; X86-NEXT:    movl %ebp, %esp
127; X86-NEXT:    popl %ebp
128; X86-NEXT:    .cfi_def_cfa %esp, 4
129; X86-NEXT:    vzeroupper
130; X86-NEXT:    retl
131;
132; X64-LABEL: test_mm512_kortestz:
133; X64:       # %bb.0: # %entry
134; X64-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
135; X64-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
136; X64-NEXT:    korw %k0, %k1, %k0
137; X64-NEXT:    kmovw %k0, %eax
138; X64-NEXT:    cmpw $0, %ax
139; X64-NEXT:    sete %al
140; X64-NEXT:    andb $1, %al
141; X64-NEXT:    movzbl %al, %eax
142; X64-NEXT:    vzeroupper
143; X64-NEXT:    retq
144entry:
145  %0 = bitcast <8 x i64> %__A to <16 x i32>
146  %1 = bitcast <8 x i64> %__B to <16 x i32>
147  %2 = icmp ne <16 x i32> %0, %1
148  %3 = bitcast <8 x i64> %__C to <16 x i32>
149  %4 = bitcast <8 x i64> %__D to <16 x i32>
150  %5 = icmp ne <16 x i32> %3, %4
151  %6 = or <16 x i1> %5, %2
152  %7 = bitcast <16 x i1> %6 to i16
153  %8 = icmp eq i16 %7, 0
154  %9 = zext i1 %8 to i32
155  ret i32 %9
156}
157
158define <16 x float> @test_mm512_shuffle_f32x4(<16 x float> %__A, <16 x float> %__B) {
159; CHECK-LABEL: test_mm512_shuffle_f32x4:
160; CHECK:       # %bb.0: # %entry
161; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
162; CHECK-NEXT:    ret{{[l|q]}}
163entry:
164  %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
165  ret <16 x float> %shuffle
166}
167
168
169define <16 x float> @test_mm512_mask_shuffle_f32x4(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
170; X86-LABEL: test_mm512_mask_shuffle_f32x4:
171; X86:       # %bb.0: # %entry
172; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
173; X86-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
174; X86-NEXT:    retl
175;
176; X64-LABEL: test_mm512_mask_shuffle_f32x4:
177; X64:       # %bb.0: # %entry
178; X64-NEXT:    kmovw %edi, %k1
179; X64-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
180; X64-NEXT:    retq
181entry:
182  %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
183  %0 = bitcast i16 %__U to <16 x i1>
184  %1 = select <16 x i1> %0, <16 x float> %shuffle, <16 x float> %__W
185  ret <16 x float> %1
186}
187
188define <16 x float> @test_mm512_maskz_shuffle_f32x4(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
189; X86-LABEL: test_mm512_maskz_shuffle_f32x4:
190; X86:       # %bb.0: # %entry
191; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
192; X86-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
193; X86-NEXT:    retl
194;
195; X64-LABEL: test_mm512_maskz_shuffle_f32x4:
196; X64:       # %bb.0: # %entry
197; X64-NEXT:    kmovw %edi, %k1
198; X64-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
199; X64-NEXT:    retq
200entry:
201  %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
202  %0 = bitcast i16 %__U to <16 x i1>
203  %1 = select <16 x i1> %0, <16 x float> %shuffle, <16 x float> zeroinitializer
204  ret <16 x float> %1
205}
206
207define <8 x double> @test_mm512_shuffle_f64x2(<8 x double> %__A, <8 x double> %__B) {
208; CHECK-LABEL: test_mm512_shuffle_f64x2:
209; CHECK:       # %bb.0: # %entry
210; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
211; CHECK-NEXT:    ret{{[l|q]}}
212entry:
213  %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
214  ret <8 x double> %shuffle
215}
216
217define <8 x double> @test_mm512_mask_shuffle_f64x2(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
218; X86-LABEL: test_mm512_mask_shuffle_f64x2:
219; X86:       # %bb.0: # %entry
220; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
221; X86-NEXT:    kmovw %eax, %k1
222; X86-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
223; X86-NEXT:    retl
224;
225; X64-LABEL: test_mm512_mask_shuffle_f64x2:
226; X64:       # %bb.0: # %entry
227; X64-NEXT:    kmovw %edi, %k1
228; X64-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
229; X64-NEXT:    retq
230entry:
231  %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
232  %0 = bitcast i8 %__U to <8 x i1>
233  %1 = select <8 x i1> %0, <8 x double> %shuffle, <8 x double> %__W
234  ret <8 x double> %1
235}
236
237define <8 x double> @test_mm512_maskz_shuffle_f64x2(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
238; X86-LABEL: test_mm512_maskz_shuffle_f64x2:
239; X86:       # %bb.0: # %entry
240; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
241; X86-NEXT:    kmovw %eax, %k1
242; X86-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
243; X86-NEXT:    retl
244;
245; X64-LABEL: test_mm512_maskz_shuffle_f64x2:
246; X64:       # %bb.0: # %entry
247; X64-NEXT:    kmovw %edi, %k1
248; X64-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
249; X64-NEXT:    retq
250entry:
251  %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
252  %0 = bitcast i8 %__U to <8 x i1>
253  %1 = select <8 x i1> %0, <8 x double> %shuffle, <8 x double> zeroinitializer
254  ret <8 x double> %1
255}
256
257define <8 x i64> @test_mm512_shuffle_i32x4(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
258; CHECK-LABEL: test_mm512_shuffle_i32x4:
259; CHECK:       # %bb.0: # %entry
260; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
261; CHECK-NEXT:    ret{{[l|q]}}
262entry:
263  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
264  ret <8 x i64> %shuffle
265}
266
267define <8 x i64> @test_mm512_mask_shuffle_i32x4(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
268; X86-LABEL: test_mm512_mask_shuffle_i32x4:
269; X86:       # %bb.0: # %entry
270; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
271; X86-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
272; X86-NEXT:    retl
273;
274; X64-LABEL: test_mm512_mask_shuffle_i32x4:
275; X64:       # %bb.0: # %entry
276; X64-NEXT:    kmovw %edi, %k1
277; X64-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
278; X64-NEXT:    retq
279entry:
280  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
281  %0 = bitcast <8 x i64> %shuffle to <16 x i32>
282  %1 = bitcast <8 x i64> %__W to <16 x i32>
283  %2 = bitcast i16 %__U to <16 x i1>
284  %3 = select <16 x i1> %2, <16 x i32> %0, <16 x i32> %1
285  %4 = bitcast <16 x i32> %3 to <8 x i64>
286  ret <8 x i64> %4
287}
288
289define <8 x i64> @test_mm512_maskz_shuffle_i32x4(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
290; X86-LABEL: test_mm512_maskz_shuffle_i32x4:
291; X86:       # %bb.0: # %entry
292; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
293; X86-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
294; X86-NEXT:    retl
295;
296; X64-LABEL: test_mm512_maskz_shuffle_i32x4:
297; X64:       # %bb.0: # %entry
298; X64-NEXT:    kmovw %edi, %k1
299; X64-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
300; X64-NEXT:    retq
301entry:
302  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
303  %0 = bitcast <8 x i64> %shuffle to <16 x i32>
304  %1 = bitcast i16 %__U to <16 x i1>
305  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
306  %3 = bitcast <16 x i32> %2 to <8 x i64>
307  ret <8 x i64> %3
308}
309
310define <8 x i64> @test_mm512_shuffle_i64x2(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
311; CHECK-LABEL: test_mm512_shuffle_i64x2:
312; CHECK:       # %bb.0: # %entry
313; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
314; CHECK-NEXT:    ret{{[l|q]}}
315entry:
316  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
317  ret <8 x i64> %shuffle
318}
319
320define <8 x i64> @test_mm512_mask_shuffle_i64x2(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
321; X86-LABEL: test_mm512_mask_shuffle_i64x2:
322; X86:       # %bb.0: # %entry
323; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
324; X86-NEXT:    kmovw %eax, %k1
325; X86-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
326; X86-NEXT:    retl
327;
328; X64-LABEL: test_mm512_mask_shuffle_i64x2:
329; X64:       # %bb.0: # %entry
330; X64-NEXT:    kmovw %edi, %k1
331; X64-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
332; X64-NEXT:    retq
333entry:
334  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
335  %0 = bitcast i8 %__U to <8 x i1>
336  %1 = select <8 x i1> %0, <8 x i64> %shuffle, <8 x i64> %__W
337  ret <8 x i64> %1
338}
339
340define <8 x i64> @test_mm512_maskz_shuffle_i64x2(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
341; X86-LABEL: test_mm512_maskz_shuffle_i64x2:
342; X86:       # %bb.0: # %entry
343; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
344; X86-NEXT:    kmovw %eax, %k1
345; X86-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
346; X86-NEXT:    retl
347;
348; X64-LABEL: test_mm512_maskz_shuffle_i64x2:
349; X64:       # %bb.0: # %entry
350; X64-NEXT:    kmovw %edi, %k1
351; X64-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
352; X64-NEXT:    retq
353entry:
354  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
355  %0 = bitcast i8 %__U to <8 x i1>
356  %1 = select <8 x i1> %0, <8 x i64> %shuffle, <8 x i64> zeroinitializer
357  ret <8 x i64> %1
358}
359
360
361define zeroext i16 @test_mm512_testn_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) {
362; CHECK-LABEL: test_mm512_testn_epi32_mask:
363; CHECK:       # %bb.0: # %entry
364; CHECK-NEXT:    vptestnmd %zmm0, %zmm1, %k0
365; CHECK-NEXT:    kmovw %k0, %eax
366; CHECK-NEXT:    movzwl %ax, %eax
367; CHECK-NEXT:    vzeroupper
368; CHECK-NEXT:    ret{{[l|q]}}
369entry:
370  %and1.i.i = and <8 x i64> %__B, %__A
371  %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
372  %1 = icmp eq <16 x i32> %0, zeroinitializer
373  %2 = bitcast <16 x i1> %1 to i16
374  ret i16 %2
375}
376
377define zeroext i16 @test_mm512_mask_testn_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
378; X86-LABEL: test_mm512_mask_testn_epi32_mask:
379; X86:       # %bb.0: # %entry
380; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
381; X86-NEXT:    vptestnmd %zmm0, %zmm1, %k0 {%k1}
382; X86-NEXT:    kmovw %k0, %eax
383; X86-NEXT:    movzwl %ax, %eax
384; X86-NEXT:    vzeroupper
385; X86-NEXT:    retl
386;
387; X64-LABEL: test_mm512_mask_testn_epi32_mask:
388; X64:       # %bb.0: # %entry
389; X64-NEXT:    kmovw %edi, %k1
390; X64-NEXT:    vptestnmd %zmm0, %zmm1, %k0 {%k1}
391; X64-NEXT:    kmovw %k0, %eax
392; X64-NEXT:    movzwl %ax, %eax
393; X64-NEXT:    vzeroupper
394; X64-NEXT:    retq
395entry:
396  %and1.i.i = and <8 x i64> %__B, %__A
397  %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
398  %1 = icmp eq <16 x i32> %0, zeroinitializer
399  %2 = bitcast i16 %__U to <16 x i1>
400  %3 = and <16 x i1> %1, %2
401  %4 = bitcast <16 x i1> %3 to i16
402  ret i16 %4
403}
404
405define zeroext i8 @test_mm512_testn_epi64_mask(<8 x i64> %__A, <8 x i64> %__B) {
406; CHECK-LABEL: test_mm512_testn_epi64_mask:
407; CHECK:       # %bb.0: # %entry
408; CHECK-NEXT:    vptestnmq %zmm0, %zmm1, %k0
409; CHECK-NEXT:    kmovw %k0, %eax
410; CHECK-NEXT:    movzbl %al, %eax
411; CHECK-NEXT:    vzeroupper
412; CHECK-NEXT:    ret{{[l|q]}}
413entry:
414  %and1.i.i = and <8 x i64> %__B, %__A
415  %0 = icmp eq <8 x i64> %and1.i.i, zeroinitializer
416  %1 = bitcast <8 x i1> %0 to i8
417  ret i8 %1
418}
419
420define zeroext i8 @test_mm512_mask_testn_epi64_mask(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
421; X86-LABEL: test_mm512_mask_testn_epi64_mask:
422; X86:       # %bb.0: # %entry
423; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
424; X86-NEXT:    kmovw %eax, %k1
425; X86-NEXT:    vptestnmq %zmm0, %zmm1, %k0 {%k1}
426; X86-NEXT:    kmovw %k0, %eax
427; X86-NEXT:    movzbl %al, %eax
428; X86-NEXT:    vzeroupper
429; X86-NEXT:    retl
430;
431; X64-LABEL: test_mm512_mask_testn_epi64_mask:
432; X64:       # %bb.0: # %entry
433; X64-NEXT:    kmovw %edi, %k1
434; X64-NEXT:    vptestnmq %zmm0, %zmm1, %k0 {%k1}
435; X64-NEXT:    kmovw %k0, %eax
436; X64-NEXT:    movzbl %al, %eax
437; X64-NEXT:    vzeroupper
438; X64-NEXT:    retq
439entry:
440  %and1.i.i = and <8 x i64> %__B, %__A
441  %0 = icmp eq <8 x i64> %and1.i.i, zeroinitializer
442  %1 = bitcast i8 %__U to <8 x i1>
443  %2 = and <8 x i1> %0, %1
444  %3 = bitcast <8 x i1> %2 to i8
445  ret i8 %3
446}
447
448define zeroext i16 @test_mm512_mask_test_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
449; X86-LABEL: test_mm512_mask_test_epi32_mask:
450; X86:       # %bb.0: # %entry
451; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
452; X86-NEXT:    vptestmd %zmm0, %zmm1, %k0 {%k1}
453; X86-NEXT:    kmovw %k0, %eax
454; X86-NEXT:    movzwl %ax, %eax
455; X86-NEXT:    vzeroupper
456; X86-NEXT:    retl
457;
458; X64-LABEL: test_mm512_mask_test_epi32_mask:
459; X64:       # %bb.0: # %entry
460; X64-NEXT:    kmovw %edi, %k1
461; X64-NEXT:    vptestmd %zmm0, %zmm1, %k0 {%k1}
462; X64-NEXT:    kmovw %k0, %eax
463; X64-NEXT:    movzwl %ax, %eax
464; X64-NEXT:    vzeroupper
465; X64-NEXT:    retq
466entry:
467  %and1.i.i = and <8 x i64> %__B, %__A
468  %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
469  %1 = icmp ne <16 x i32> %0, zeroinitializer
470  %2 = bitcast i16 %__U to <16 x i1>
471  %3 = and <16 x i1> %1, %2
472  %4 = bitcast <16 x i1> %3 to i16
473  ret i16 %4
474}
475
476define zeroext i8 @test_mm512_mask_test_epi64_mask(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
477; X86-LABEL: test_mm512_mask_test_epi64_mask:
478; X86:       # %bb.0: # %entry
479; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
480; X86-NEXT:    kmovw %eax, %k1
481; X86-NEXT:    vptestmq %zmm0, %zmm1, %k0 {%k1}
482; X86-NEXT:    kmovw %k0, %eax
483; X86-NEXT:    movzbl %al, %eax
484; X86-NEXT:    vzeroupper
485; X86-NEXT:    retl
486;
487; X64-LABEL: test_mm512_mask_test_epi64_mask:
488; X64:       # %bb.0: # %entry
489; X64-NEXT:    kmovw %edi, %k1
490; X64-NEXT:    vptestmq %zmm0, %zmm1, %k0 {%k1}
491; X64-NEXT:    kmovw %k0, %eax
492; X64-NEXT:    movzbl %al, %eax
493; X64-NEXT:    vzeroupper
494; X64-NEXT:    retq
495entry:
496  %and1.i.i = and <8 x i64> %__B, %__A
497  %0 = icmp ne <8 x i64> %and1.i.i, zeroinitializer
498  %1 = bitcast i8 %__U to <8 x i1>
499  %2 = and <8 x i1> %0, %1
500  %3 = bitcast <8 x i1> %2 to i8
501  ret i8 %3
502}
503
504define <8 x i64> @test_mm512_mask_set1_epi32(<8 x i64> %__O, i16 zeroext %__M, i32 %__A) {
505; X86-LABEL: test_mm512_mask_set1_epi32:
506; X86:       # %bb.0: # %entry
507; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
508; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
509; X86-NEXT:    vpbroadcastd %eax, %zmm0 {%k1}
510; X86-NEXT:    retl
511;
512; X64-LABEL: test_mm512_mask_set1_epi32:
513; X64:       # %bb.0: # %entry
514; X64-NEXT:    kmovw %edi, %k1
515; X64-NEXT:    vpbroadcastd %esi, %zmm0 {%k1}
516; X64-NEXT:    retq
517entry:
518  %vecinit.i.i = insertelement <16 x i32> undef, i32 %__A, i32 0
519  %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
520  %0 = bitcast <8 x i64> %__O to <16 x i32>
521  %1 = bitcast i16 %__M to <16 x i1>
522  %2 = select <16 x i1> %1, <16 x i32> %vecinit15.i.i, <16 x i32> %0
523  %3 = bitcast <16 x i32> %2 to <8 x i64>
524  ret <8 x i64> %3
525}
526
527define <8 x i64> @test_mm512_maskz_set1_epi32(i16 zeroext %__M, i32 %__A)  {
528; X86-LABEL: test_mm512_maskz_set1_epi32:
529; X86:       # %bb.0: # %entry
530; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
531; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
532; X86-NEXT:    vpbroadcastd %eax, %zmm0 {%k1} {z}
533; X86-NEXT:    retl
534;
535; X64-LABEL: test_mm512_maskz_set1_epi32:
536; X64:       # %bb.0: # %entry
537; X64-NEXT:    kmovw %edi, %k1
538; X64-NEXT:    vpbroadcastd %esi, %zmm0 {%k1} {z}
539; X64-NEXT:    retq
540entry:
541  %vecinit.i.i = insertelement <16 x i32> undef, i32 %__A, i32 0
542  %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
543  %0 = bitcast i16 %__M to <16 x i1>
544  %1 = select <16 x i1> %0, <16 x i32> %vecinit15.i.i, <16 x i32> zeroinitializer
545  %2 = bitcast <16 x i32> %1 to <8 x i64>
546  ret <8 x i64> %2
547}
548
549define <8 x i64> @test_mm512_mask_set1_epi64(<8 x i64> %__O, i8 zeroext %__M, i64 %__A) {
550; X86-LABEL: test_mm512_mask_set1_epi64:
551; X86:       # %bb.0: # %entry
552; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
553; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
554; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
555; X86-NEXT:    kmovw %eax, %k1
556; X86-NEXT:    vpbroadcastq %xmm1, %zmm0 {%k1}
557; X86-NEXT:    retl
558;
559; X64-LABEL: test_mm512_mask_set1_epi64:
560; X64:       # %bb.0: # %entry
561; X64-NEXT:    kmovw %edi, %k1
562; X64-NEXT:    vpbroadcastq %rsi, %zmm0 {%k1}
563; X64-NEXT:    retq
564entry:
565  %vecinit.i.i = insertelement <8 x i64> undef, i64 %__A, i32 0
566  %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
567  %0 = bitcast i8 %__M to <8 x i1>
568  %1 = select <8 x i1> %0, <8 x i64> %vecinit7.i.i, <8 x i64> %__O
569  ret <8 x i64> %1
570}
571
572define <8 x i64> @test_mm512_maskz_set1_epi64(i8 zeroext %__M, i64 %__A)  {
573; X86-LABEL: test_mm512_maskz_set1_epi64:
574; X86:       # %bb.0: # %entry
575; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
576; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
577; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
578; X86-NEXT:    kmovw %eax, %k1
579; X86-NEXT:    vpbroadcastq %xmm0, %zmm0 {%k1} {z}
580; X86-NEXT:    retl
581;
582; X64-LABEL: test_mm512_maskz_set1_epi64:
583; X64:       # %bb.0: # %entry
584; X64-NEXT:    kmovw %edi, %k1
585; X64-NEXT:    vpbroadcastq %rsi, %zmm0 {%k1} {z}
586; X64-NEXT:    retq
587entry:
588  %vecinit.i.i = insertelement <8 x i64> undef, i64 %__A, i32 0
589  %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
590  %0 = bitcast i8 %__M to <8 x i1>
591  %1 = select <8 x i1> %0, <8 x i64> %vecinit7.i.i, <8 x i64> zeroinitializer
592  ret <8 x i64> %1
593}
594
595
596define <8 x i64> @test_mm512_broadcastd_epi32(<2 x i64> %a0) {
597; CHECK-LABEL: test_mm512_broadcastd_epi32:
598; CHECK:       # %bb.0:
599; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
600; CHECK-NEXT:    ret{{[l|q]}}
601  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
602  %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <16 x i32> zeroinitializer
603  %res1 = bitcast <16 x i32> %res0 to <8 x i64>
604  ret <8 x i64> %res1
605}
606
607define <8 x i64> @test_mm512_mask_broadcastd_epi32(<8 x i64> %a0, i16 %a1, <2 x i64> %a2) {
608; X86-LABEL: test_mm512_mask_broadcastd_epi32:
609; X86:       # %bb.0:
610; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
611; X86-NEXT:    vpbroadcastd %xmm1, %zmm0 {%k1}
612; X86-NEXT:    retl
613;
614; X64-LABEL: test_mm512_mask_broadcastd_epi32:
615; X64:       # %bb.0:
616; X64-NEXT:    kmovw %edi, %k1
617; X64-NEXT:    vpbroadcastd %xmm1, %zmm0 {%k1}
618; X64-NEXT:    retq
619  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
620  %arg1 = bitcast i16 %a1 to <16 x i1>
621  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
622  %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <16 x i32> zeroinitializer
623  %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
624  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
625  ret <8 x i64> %res2
626}
627
628define <8 x i64> @test_mm512_maskz_broadcastd_epi32(i16 %a0, <2 x i64> %a1) {
629; X86-LABEL: test_mm512_maskz_broadcastd_epi32:
630; X86:       # %bb.0:
631; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
632; X86-NEXT:    vpbroadcastd %xmm0, %zmm0 {%k1} {z}
633; X86-NEXT:    retl
634;
635; X64-LABEL: test_mm512_maskz_broadcastd_epi32:
636; X64:       # %bb.0:
637; X64-NEXT:    kmovw %edi, %k1
638; X64-NEXT:    vpbroadcastd %xmm0, %zmm0 {%k1} {z}
639; X64-NEXT:    retq
640  %arg0 = bitcast i16 %a0 to <16 x i1>
641  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
642  %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <16 x i32> zeroinitializer
643  %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
644  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
645  ret <8 x i64> %res2
646}
647
648define <8 x i64> @test_mm512_broadcastq_epi64(<2 x i64> %a0) {
649; CHECK-LABEL: test_mm512_broadcastq_epi64:
650; CHECK:       # %bb.0:
651; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0
652; CHECK-NEXT:    ret{{[l|q]}}
653  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <8 x i32> zeroinitializer
654  ret <8 x i64> %res
655}
656
657define <8 x i64> @test_mm512_mask_broadcastq_epi64(<8 x i64> %a0, i8 %a1, <2 x i64> %a2) {
658; X86-LABEL: test_mm512_mask_broadcastq_epi64:
659; X86:       # %bb.0:
660; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
661; X86-NEXT:    kmovw %eax, %k1
662; X86-NEXT:    vpbroadcastq %xmm1, %zmm0 {%k1}
663; X86-NEXT:    retl
664;
665; X64-LABEL: test_mm512_mask_broadcastq_epi64:
666; X64:       # %bb.0:
667; X64-NEXT:    kmovw %edi, %k1
668; X64-NEXT:    vpbroadcastq %xmm1, %zmm0 {%k1}
669; X64-NEXT:    retq
670  %arg1 = bitcast i8 %a1 to <8 x i1>
671  %res0 = shufflevector <2 x i64> %a2, <2 x i64> undef, <8 x i32> zeroinitializer
672  %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
673  ret <8 x i64> %res1
674}
675
676define <8 x i64> @test_mm512_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
677; X86-LABEL: test_mm512_maskz_broadcastq_epi64:
678; X86:       # %bb.0:
679; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
680; X86-NEXT:    kmovw %eax, %k1
681; X86-NEXT:    vpbroadcastq %xmm0, %zmm0 {%k1} {z}
682; X86-NEXT:    retl
683;
684; X64-LABEL: test_mm512_maskz_broadcastq_epi64:
685; X64:       # %bb.0:
686; X64-NEXT:    kmovw %edi, %k1
687; X64-NEXT:    vpbroadcastq %xmm0, %zmm0 {%k1} {z}
688; X64-NEXT:    retq
689  %arg0 = bitcast i8 %a0 to <8 x i1>
690  %res0 = shufflevector <2 x i64> %a1, <2 x i64> undef, <8 x i32> zeroinitializer
691  %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
692  ret <8 x i64> %res1
693}
694
695define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a0) {
696; CHECK-LABEL: test_mm512_broadcastsd_pd:
697; CHECK:       # %bb.0:
698; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0
699; CHECK-NEXT:    ret{{[l|q]}}
700  %res = shufflevector <2 x double> %a0, <2 x double> undef, <8 x i32> zeroinitializer
701  ret <8 x double> %res
702}
703
704define <8 x double> @test_mm512_mask_broadcastsd_pd(<8 x double> %a0, i8 %a1, <2 x double> %a2) {
705; X86-LABEL: test_mm512_mask_broadcastsd_pd:
706; X86:       # %bb.0:
707; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
708; X86-NEXT:    kmovw %eax, %k1
709; X86-NEXT:    vbroadcastsd %xmm1, %zmm0 {%k1}
710; X86-NEXT:    retl
711;
712; X64-LABEL: test_mm512_mask_broadcastsd_pd:
713; X64:       # %bb.0:
714; X64-NEXT:    kmovw %edi, %k1
715; X64-NEXT:    vbroadcastsd %xmm1, %zmm0 {%k1}
716; X64-NEXT:    retq
717  %arg1 = bitcast i8 %a1 to <8 x i1>
718  %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <8 x i32> zeroinitializer
719  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
720  ret <8 x double> %res1
721}
722
723define <8 x double> @test_mm512_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
724; X86-LABEL: test_mm512_maskz_broadcastsd_pd:
725; X86:       # %bb.0:
726; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
727; X86-NEXT:    kmovw %eax, %k1
728; X86-NEXT:    vbroadcastsd %xmm0, %zmm0 {%k1} {z}
729; X86-NEXT:    retl
730;
731; X64-LABEL: test_mm512_maskz_broadcastsd_pd:
732; X64:       # %bb.0:
733; X64-NEXT:    kmovw %edi, %k1
734; X64-NEXT:    vbroadcastsd %xmm0, %zmm0 {%k1} {z}
735; X64-NEXT:    retq
736  %arg0 = bitcast i8 %a0 to <8 x i1>
737  %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <8 x i32> zeroinitializer
738  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
739  ret <8 x double> %res1
740}
741
742define <16 x float> @test_mm512_broadcastss_ps(<4 x float> %a0) {
743; CHECK-LABEL: test_mm512_broadcastss_ps:
744; CHECK:       # %bb.0:
745; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
746; CHECK-NEXT:    ret{{[l|q]}}
747  %res = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> zeroinitializer
748  ret <16 x float> %res
749}
750
751define <16 x float> @test_mm512_mask_broadcastss_ps(<16 x float> %a0, i16 %a1, <4 x float> %a2) {
752; X86-LABEL: test_mm512_mask_broadcastss_ps:
753; X86:       # %bb.0:
754; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
755; X86-NEXT:    vbroadcastss %xmm1, %zmm0 {%k1}
756; X86-NEXT:    retl
757;
758; X64-LABEL: test_mm512_mask_broadcastss_ps:
759; X64:       # %bb.0:
760; X64-NEXT:    kmovw %edi, %k1
761; X64-NEXT:    vbroadcastss %xmm1, %zmm0 {%k1}
762; X64-NEXT:    retq
763  %arg1 = bitcast i16 %a1 to <16 x i1>
764  %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <16 x i32> zeroinitializer
765  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
766  ret <16 x float> %res1
767}
768
769define <16 x float> @test_mm512_maskz_broadcastss_ps(i16 %a0, <4 x float> %a1) {
770; X86-LABEL: test_mm512_maskz_broadcastss_ps:
771; X86:       # %bb.0:
772; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
773; X86-NEXT:    vbroadcastss %xmm0, %zmm0 {%k1} {z}
774; X86-NEXT:    retl
775;
776; X64-LABEL: test_mm512_maskz_broadcastss_ps:
777; X64:       # %bb.0:
778; X64-NEXT:    kmovw %edi, %k1
779; X64-NEXT:    vbroadcastss %xmm0, %zmm0 {%k1} {z}
780; X64-NEXT:    retq
781  %arg0 = bitcast i16 %a0 to <16 x i1>
782  %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <16 x i32> zeroinitializer
783  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
784  ret <16 x float> %res1
785}
786
787define <8 x double> @test_mm512_movedup_pd(<8 x double> %a0) {
788; CHECK-LABEL: test_mm512_movedup_pd:
789; CHECK:       # %bb.0:
790; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
791; CHECK-NEXT:    ret{{[l|q]}}
792  %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
793  ret <8 x double> %res
794}
795
796define <8 x double> @test_mm512_mask_movedup_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
797; X86-LABEL: test_mm512_mask_movedup_pd:
798; X86:       # %bb.0:
799; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
800; X86-NEXT:    kmovw %eax, %k1
801; X86-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]
802; X86-NEXT:    retl
803;
804; X64-LABEL: test_mm512_mask_movedup_pd:
805; X64:       # %bb.0:
806; X64-NEXT:    kmovw %edi, %k1
807; X64-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]
808; X64-NEXT:    retq
809  %arg1 = bitcast i8 %a1 to <8 x i1>
810  %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
811  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
812  ret <8 x double> %res1
813}
814
815define <8 x double> @test_mm512_maskz_movedup_pd(i8 %a0, <8 x double> %a1) {
816; X86-LABEL: test_mm512_maskz_movedup_pd:
817; X86:       # %bb.0:
818; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
819; X86-NEXT:    kmovw %eax, %k1
820; X86-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
821; X86-NEXT:    retl
822;
823; X64-LABEL: test_mm512_maskz_movedup_pd:
824; X64:       # %bb.0:
825; X64-NEXT:    kmovw %edi, %k1
826; X64-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
827; X64-NEXT:    retq
828  %arg0 = bitcast i8 %a0 to <8 x i1>
829  %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
830  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
831  ret <8 x double> %res1
832}
833
834define <16 x float> @test_mm512_movehdup_ps(<16 x float> %a0) {
835; CHECK-LABEL: test_mm512_movehdup_ps:
836; CHECK:       # %bb.0:
837; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
838; CHECK-NEXT:    ret{{[l|q]}}
839  %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
840  ret <16 x float> %res
841}
842
843define <16 x float> @test_mm512_mask_movehdup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
844; X86-LABEL: test_mm512_mask_movehdup_ps:
845; X86:       # %bb.0:
846; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
847; X86-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
848; X86-NEXT:    retl
849;
850; X64-LABEL: test_mm512_mask_movehdup_ps:
851; X64:       # %bb.0:
852; X64-NEXT:    kmovw %edi, %k1
853; X64-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
854; X64-NEXT:    retq
855  %arg1 = bitcast i16 %a1 to <16 x i1>
856  %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
857  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
858  ret <16 x float> %res1
859}
860
861define <16 x float> @test_mm512_maskz_movehdup_ps(i16 %a0, <16 x float> %a1) {
862; X86-LABEL: test_mm512_maskz_movehdup_ps:
863; X86:       # %bb.0:
864; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
865; X86-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
866; X86-NEXT:    retl
867;
868; X64-LABEL: test_mm512_maskz_movehdup_ps:
869; X64:       # %bb.0:
870; X64-NEXT:    kmovw %edi, %k1
871; X64-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
872; X64-NEXT:    retq
873  %arg0 = bitcast i16 %a0 to <16 x i1>
874  %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
875  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
876  ret <16 x float> %res1
877}
878
879define <16 x float> @test_mm512_moveldup_ps(<16 x float> %a0) {
880; CHECK-LABEL: test_mm512_moveldup_ps:
881; CHECK:       # %bb.0:
882; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
883; CHECK-NEXT:    ret{{[l|q]}}
884  %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
885  ret <16 x float> %res
886}
887
888define <16 x float> @test_mm512_mask_moveldup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
889; X86-LABEL: test_mm512_mask_moveldup_ps:
890; X86:       # %bb.0:
891; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
892; X86-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
893; X86-NEXT:    retl
894;
895; X64-LABEL: test_mm512_mask_moveldup_ps:
896; X64:       # %bb.0:
897; X64-NEXT:    kmovw %edi, %k1
898; X64-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
899; X64-NEXT:    retq
900  %arg1 = bitcast i16 %a1 to <16 x i1>
901  %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
902  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
903  ret <16 x float> %res1
904}
905
906define <16 x float> @test_mm512_maskz_moveldup_ps(i16 %a0, <16 x float> %a1) {
907; X86-LABEL: test_mm512_maskz_moveldup_ps:
908; X86:       # %bb.0:
909; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
910; X86-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
911; X86-NEXT:    retl
912;
913; X64-LABEL: test_mm512_maskz_moveldup_ps:
914; X64:       # %bb.0:
915; X64-NEXT:    kmovw %edi, %k1
916; X64-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
917; X64-NEXT:    retq
918  %arg0 = bitcast i16 %a0 to <16 x i1>
919  %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
920  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
921  ret <16 x float> %res1
922}
923
924define <8 x double> @test_mm512_permute_pd(<8 x double> %a0) {
925; CHECK-LABEL: test_mm512_permute_pd:
926; CHECK:       # %bb.0:
927; CHECK-NEXT:    vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,2,4,4,6,6]
928; CHECK-NEXT:    ret{{[l|q]}}
929  %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
930  ret <8 x double> %res
931}
932
933define <8 x double> @test_mm512_mask_permute_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
934; X86-LABEL: test_mm512_mask_permute_pd:
935; X86:       # %bb.0:
936; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
937; X86-NEXT:    kmovw %eax, %k1
938; X86-NEXT:    vpermilpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6]
939; X86-NEXT:    retl
940;
941; X64-LABEL: test_mm512_mask_permute_pd:
942; X64:       # %bb.0:
943; X64-NEXT:    kmovw %edi, %k1
944; X64-NEXT:    vpermilpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6]
945; X64-NEXT:    retq
946  %arg1 = bitcast i8 %a1 to <8 x i1>
947  %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
948  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
949  ret <8 x double> %res1
950}
951
952define <8 x double> @test_mm512_maskz_permute_pd(i8 %a0, <8 x double> %a1) {
953; X86-LABEL: test_mm512_maskz_permute_pd:
954; X86:       # %bb.0:
955; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
956; X86-NEXT:    kmovw %eax, %k1
957; X86-NEXT:    vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6]
958; X86-NEXT:    retl
959;
960; X64-LABEL: test_mm512_maskz_permute_pd:
961; X64:       # %bb.0:
962; X64-NEXT:    kmovw %edi, %k1
963; X64-NEXT:    vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6]
964; X64-NEXT:    retq
965  %arg0 = bitcast i8 %a0 to <8 x i1>
966  %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
967  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
968  ret <8 x double> %res1
969}
970
971define <16 x float> @test_mm512_permute_ps(<16 x float> %a0) {
972; CHECK-LABEL: test_mm512_permute_ps:
973; CHECK:       # %bb.0:
974; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
975; CHECK-NEXT:    ret{{[l|q]}}
976  %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
977  ret <16 x float> %res
978}
979
980define <16 x float> @test_mm512_mask_permute_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
981; X86-LABEL: test_mm512_mask_permute_ps:
982; X86:       # %bb.0:
983; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
984; X86-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
985; X86-NEXT:    retl
986;
987; X64-LABEL: test_mm512_mask_permute_ps:
988; X64:       # %bb.0:
989; X64-NEXT:    kmovw %edi, %k1
990; X64-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
991; X64-NEXT:    retq
992  %arg1 = bitcast i16 %a1 to <16 x i1>
993  %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
994  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
995  ret <16 x float> %res1
996}
997
998define <16 x float> @test_mm512_maskz_permute_ps(i16 %a0, <16 x float> %a1) {
999; X86-LABEL: test_mm512_maskz_permute_ps:
1000; X86:       # %bb.0:
1001; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
1002; X86-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
1003; X86-NEXT:    retl
1004;
1005; X64-LABEL: test_mm512_maskz_permute_ps:
1006; X64:       # %bb.0:
1007; X64-NEXT:    kmovw %edi, %k1
1008; X64-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
1009; X64-NEXT:    retq
1010  %arg0 = bitcast i16 %a0 to <16 x i1>
1011  %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
1012  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
1013  ret <16 x float> %res1
1014}
1015
1016define <8 x i64> @test_mm512_permutex_epi64(<8 x i64> %a0) {
1017; CHECK-LABEL: test_mm512_permutex_epi64:
1018; CHECK:       # %bb.0:
1019; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
1020; CHECK-NEXT:    ret{{[l|q]}}
1021  %res = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
1022  ret <8 x i64> %res
1023}
1024
1025define <8 x i64> @test_mm512_mask_permutex_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2) {
1026; X86-LABEL: test_mm512_mask_permutex_epi64:
1027; X86:       # %bb.0:
1028; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1029; X86-NEXT:    kmovw %eax, %k1
1030; X86-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
1031; X86-NEXT:    retl
1032;
1033; X64-LABEL: test_mm512_mask_permutex_epi64:
1034; X64:       # %bb.0:
1035; X64-NEXT:    kmovw %edi, %k1
1036; X64-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
1037; X64-NEXT:    retq
1038  %arg1 = bitcast i8 %a1 to <8 x i1>
1039  %res0 = shufflevector <8 x i64> %a2, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
1040  %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
1041  ret <8 x i64> %res1
1042}
1043
1044define <8 x i64> @test_mm512_maskz_permutex_epi64(i8 %a0, <8 x i64> %a1) {
1045; X86-LABEL: test_mm512_maskz_permutex_epi64:
1046; X86:       # %bb.0:
1047; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1048; X86-NEXT:    kmovw %eax, %k1
1049; X86-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
1050; X86-NEXT:    retl
1051;
1052; X64-LABEL: test_mm512_maskz_permutex_epi64:
1053; X64:       # %bb.0:
1054; X64-NEXT:    kmovw %edi, %k1
1055; X64-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
1056; X64-NEXT:    retq
1057  %arg0 = bitcast i8 %a0 to <8 x i1>
1058  %res0 = shufflevector <8 x i64> %a1, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
1059  %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
1060  ret <8 x i64> %res1
1061}
1062
1063define <8 x double> @test_mm512_permutex_pd(<8 x double> %a0) {
1064; CHECK-LABEL: test_mm512_permutex_pd:
1065; CHECK:       # %bb.0:
1066; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
1067; CHECK-NEXT:    ret{{[l|q]}}
1068  %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
1069  ret <8 x double> %res
1070}
1071
1072define <8 x double> @test_mm512_mask_permutex_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
1073; X86-LABEL: test_mm512_mask_permutex_pd:
1074; X86:       # %bb.0:
1075; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1076; X86-NEXT:    kmovw %eax, %k1
1077; X86-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
1078; X86-NEXT:    retl
1079;
1080; X64-LABEL: test_mm512_mask_permutex_pd:
1081; X64:       # %bb.0:
1082; X64-NEXT:    kmovw %edi, %k1
1083; X64-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
1084; X64-NEXT:    retq
1085  %arg1 = bitcast i8 %a1 to <8 x i1>
1086  %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
1087  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
1088  ret <8 x double> %res1
1089}
1090
1091define <8 x double> @test_mm512_maskz_permutex_pd(i8 %a0, <8 x double> %a1) {
1092; X86-LABEL: test_mm512_maskz_permutex_pd:
1093; X86:       # %bb.0:
1094; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1095; X86-NEXT:    kmovw %eax, %k1
1096; X86-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
1097; X86-NEXT:    retl
1098;
1099; X64-LABEL: test_mm512_maskz_permutex_pd:
1100; X64:       # %bb.0:
1101; X64-NEXT:    kmovw %edi, %k1
1102; X64-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
1103; X64-NEXT:    retq
1104  %arg0 = bitcast i8 %a0 to <8 x i1>
1105  %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
1106  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
1107  ret <8 x double> %res1
1108}
1109
1110define <8 x i64> @test_mm512_shuffle_epi32(<8 x i64> %a0) {
1111; CHECK-LABEL: test_mm512_shuffle_epi32:
1112; CHECK:       # %bb.0:
1113; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
1114; CHECK-NEXT:    ret{{[l|q]}}
1115  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
1116  %res0 = shufflevector <16 x i32> %arg0, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
1117  %res1 = bitcast <16 x i32> %res0 to <8 x i64>
1118  ret <8 x i64> %res1
1119}
1120
1121define <8 x i64> @test_mm512_mask_shuffle_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2) {
1122; X86-LABEL: test_mm512_mask_shuffle_epi32:
1123; X86:       # %bb.0:
1124; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
1125; X86-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
1126; X86-NEXT:    retl
1127;
1128; X64-LABEL: test_mm512_mask_shuffle_epi32:
1129; X64:       # %bb.0:
1130; X64-NEXT:    kmovw %edi, %k1
1131; X64-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
1132; X64-NEXT:    retq
1133  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
1134  %arg1 = bitcast i16 %a1 to <16 x i1>
1135  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
1136  %res0 = shufflevector <16 x i32> %arg2, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
1137  %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
1138  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
1139  ret <8 x i64> %res2
1140}
1141
1142define <8 x i64> @test_mm512_maskz_shuffle_epi32(i16 %a0, <8 x i64> %a1) {
1143; X86-LABEL: test_mm512_maskz_shuffle_epi32:
1144; X86:       # %bb.0:
1145; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
1146; X86-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
1147; X86-NEXT:    retl
1148;
1149; X64-LABEL: test_mm512_maskz_shuffle_epi32:
1150; X64:       # %bb.0:
1151; X64-NEXT:    kmovw %edi, %k1
1152; X64-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
1153; X64-NEXT:    retq
1154  %arg0 = bitcast i16 %a0 to <16 x i1>
1155  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
1156  %res0 = shufflevector <16 x i32> %arg1, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
1157  %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
1158  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
1159  ret <8 x i64> %res2
1160}
1161
1162define <8 x double> @test_mm512_shuffle_pd(<8 x double> %a0, <8 x double> %a1) {
1163; CHECK-LABEL: test_mm512_shuffle_pd:
1164; CHECK:       # %bb.0:
1165; CHECK-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1166; CHECK-NEXT:    ret{{[l|q]}}
1167  %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
1168  ret <8 x double> %res
1169}
1170
1171define <8 x double> @test_mm512_mask_shuffle_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
1172; X86-LABEL: test_mm512_mask_shuffle_pd:
1173; X86:       # %bb.0:
1174; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1175; X86-NEXT:    kmovw %eax, %k1
1176; X86-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[3],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
1177; X86-NEXT:    retl
1178;
1179; X64-LABEL: test_mm512_mask_shuffle_pd:
1180; X64:       # %bb.0:
1181; X64-NEXT:    kmovw %edi, %k1
1182; X64-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[3],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
1183; X64-NEXT:    retq
1184  %arg1 = bitcast i8 %a1 to <8 x i1>
1185  %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
1186  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
1187  ret <8 x double> %res1
1188}
1189
1190define <8 x double> @test_mm512_maskz_shuffle_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
1191; X86-LABEL: test_mm512_maskz_shuffle_pd:
1192; X86:       # %bb.0:
1193; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1194; X86-NEXT:    kmovw %eax, %k1
1195; X86-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1196; X86-NEXT:    retl
1197;
1198; X64-LABEL: test_mm512_maskz_shuffle_pd:
1199; X64:       # %bb.0:
1200; X64-NEXT:    kmovw %edi, %k1
1201; X64-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1202; X64-NEXT:    retq
1203  %arg0 = bitcast i8 %a0 to <8 x i1>
1204  %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
1205  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
1206  ret <8 x double> %res1
1207}
1208
1209define <8 x i64> @test_mm512_unpackhi_epi32(<8 x i64> %a0, <8 x i64> %a1) {
1210; CHECK-LABEL: test_mm512_unpackhi_epi32:
1211; CHECK:       # %bb.0:
1212; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
1213; CHECK-NEXT:    ret{{[l|q]}}
1214  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
1215  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
1216  %res0 = shufflevector <16 x i32> %arg0, <16 x i32> %arg1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
1217  %res1 = bitcast <16 x i32> %res0 to <8 x i64>
1218  ret <8 x i64> %res1
1219}
1220
1221define <8 x i64> @test_mm512_mask_unpackhi_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
1222; X86-LABEL: test_mm512_mask_unpackhi_epi32:
1223; X86:       # %bb.0:
1224; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
1225; X86-NEXT:    vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
1226; X86-NEXT:    retl
1227;
1228; X64-LABEL: test_mm512_mask_unpackhi_epi32:
1229; X64:       # %bb.0:
1230; X64-NEXT:    kmovw %edi, %k1
1231; X64-NEXT:    vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
1232; X64-NEXT:    retq
1233  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
1234  %arg1 = bitcast i16 %a1 to <16 x i1>
1235  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
1236  %arg3 = bitcast <8 x i64> %a3 to <16 x i32>
1237  %res0 = shufflevector <16 x i32> %arg2, <16 x i32> %arg3, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
1238  %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
1239  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
1240  ret <8 x i64> %res2
1241}
1242
1243define <8 x i64> @test_mm512_maskz_unpackhi_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
1244; X86-LABEL: test_mm512_maskz_unpackhi_epi32:
1245; X86:       # %bb.0:
1246; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
1247; X86-NEXT:    vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
1248; X86-NEXT:    retl
1249;
1250; X64-LABEL: test_mm512_maskz_unpackhi_epi32:
1251; X64:       # %bb.0:
1252; X64-NEXT:    kmovw %edi, %k1
1253; X64-NEXT:    vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
1254; X64-NEXT:    retq
1255  %arg0 = bitcast i16 %a0 to <16 x i1>
1256  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
1257  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
1258  %res0 = shufflevector <16 x i32> %arg1, <16 x i32> %arg2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
1259  %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
1260  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
1261  ret <8 x i64> %res2
1262}
1263
define <8 x i64> @test_mm512_unpackhi_epi64(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x i64> %a0, <8 x i64> %a1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ret <8 x i64> %res
}

define <8 x i64> @test_mm512_mask_unpackhi_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpackhi_epi64:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a2, <8 x i64> %a3, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_maskz_unpackhi_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_epi64:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a1, <8 x i64> %a2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
  ret <8 x i64> %res1
}

define <8 x double> @test_mm512_unpackhi_pd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_unpackhi_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpackhi_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_unpackhi_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_unpackhi_ps(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_unpackhi_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_ps:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpackhi_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X64-NEXT:    retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> %a3, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_unpackhi_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_ps:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X64-NEXT:    retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> %a2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

define <8 x i64> @test_mm512_unpacklo_epi32(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpacklo_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg0, <16 x i32> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  %res1 = bitcast <16 x i32> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_unpacklo_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpacklo_epi32:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpacklo_epi32:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
  %arg3 = bitcast <8 x i64> %a3 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg2, <16 x i32> %arg3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_unpacklo_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpacklo_epi32:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_epi32:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X64-NEXT:    retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg1, <16 x i32> %arg2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_unpacklo_epi64(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpacklo_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x i64> %a0, <8 x i64> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  ret <8 x i64> %res
}

define <8 x i64> @test_mm512_mask_unpacklo_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpacklo_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpacklo_epi64:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a2, <8 x i64> %a3, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_maskz_unpacklo_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpacklo_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_epi64:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a1, <8 x i64> %a2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
  ret <8 x i64> %res1
}

define <8 x double> @test_mm512_unpacklo_pd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: test_mm512_unpacklo_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_unpacklo_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
; X86-LABEL: test_mm512_mask_unpacklo_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpacklo_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_unpacklo_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_maskz_unpacklo_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_unpacklo_ps(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_mm512_unpacklo_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_unpacklo_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
; X86-LABEL: test_mm512_mask_unpacklo_ps:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpacklo_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
; X64-NEXT:    retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> %a3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_unpacklo_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_maskz_unpacklo_ps:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X64-NEXT:    retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> %a2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

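; The zext* tests check that zero-extending a 128- or 256-bit vector into a
; 512-bit register lowers to a bare same-register vmovaps, which implicitly
; zeroes the upper lanes.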
define <8 x double> @test_mm512_zextpd128_pd512(<2 x double> %a0) nounwind {
; CHECK-LABEL: test_mm512_zextpd128_pd512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_zextpd256_pd512(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm512_zextpd256_pd512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x double> %res
}

define <16 x float> @test_mm512_zextps128_ps512(<4 x float> %a0) nounwind {
; CHECK-LABEL: test_mm512_zextps128_ps512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_zextps256_ps512(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm512_zextps256_ps512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x float> %res
}

define <8 x i64> @test_mm512_zextsi128_si512(<2 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm512_zextsi128_si512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  ret <8 x i64> %res
}

define <8 x i64> @test_mm512_zextsi256_si512(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm512_zextsi256_si512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i64> %res
}

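; mul_epi32: sign-extend the low dword of each qword (shl by 32, then ashr exact
; by 32) and multiply; this pattern should fold into a single vpmuldq, with
; {%k1}/{z} applied in the masked variants below.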
define <8 x i64> @test_mm512_mul_epi32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
; CHECK-LABEL: test_mm512_mul_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllq $32, %zmm0, %zmm0
; CHECK-NEXT:    vpsraq $32, %zmm0, %zmm0
; CHECK-NEXT:    vpsllq $32, %zmm1, %zmm1
; CHECK-NEXT:    vpsraq $32, %zmm1, %zmm1
; CHECK-NEXT:    vpmuldq %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %tmp = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %tmp1 = ashr exact <8 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %tmp2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %tmp3 = ashr exact <8 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %tmp4 = mul nsw <8 x i64> %tmp3, %tmp1
  ret <8 x i64> %tmp4
}

define <8 x i64> @test_mm512_maskz_mul_epi32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B) nounwind {
; X86-LABEL: test_mm512_maskz_mul_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmuldq %zmm0, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_mul_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmuldq %zmm0, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %1 = ashr exact <8 x i64> %0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %3 = ashr exact <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %4 = mul nsw <8 x i64> %3, %1
  %5 = bitcast i8 %__k to <8 x i1>
  %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
  ret <8 x i64> %6
}

define <8 x i64> @test_mm512_mask_mul_epi32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__src) nounwind {
; X86-LABEL: test_mm512_mask_mul_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmuldq %zmm0, %zmm1, %zmm2 {%k1}
; X86-NEXT:    vmovdqa64 %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_mul_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmuldq %zmm0, %zmm1, %zmm2 {%k1}
; X64-NEXT:    vmovdqa64 %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %1 = ashr exact <8 x i64> %0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %3 = ashr exact <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %4 = mul nsw <8 x i64> %3, %1
  %5 = bitcast i8 %__k to <8 x i1>
  %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %__src
  ret <8 x i64> %6
}

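; mul_epu32: zero the high dword of each qword (and with 4294967295) and
; multiply, which should select vpmuludq. Note the unmasked form below currently
; materializes the lane mask via knotw/vmovdqa32 instead of folding it away.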
define <8 x i64> @test_mm512_mul_epu32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
; CHECK-LABEL: test_mm512_mul_epu32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movw $-21846, %ax # imm = 0xAAAA
; CHECK-NEXT:    kmovw %eax, %k0
; CHECK-NEXT:    knotw %k0, %k1
; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    vmovdqa32 %zmm1, %zmm1 {%k1} {z}
; CHECK-NEXT:    vpmuludq %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %tmp = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %tmp1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %tmp2 = mul nuw <8 x i64> %tmp1, %tmp
  ret <8 x i64> %tmp2
}

define <8 x i64> @test_mm512_maskz_mul_epu32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B) nounwind {
; X86-LABEL: test_mm512_maskz_mul_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmuludq %zmm0, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_mul_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmuludq %zmm0, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %2 = mul nuw <8 x i64> %1, %0
  %3 = bitcast i8 %__k to <8 x i1>
  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
  ret <8 x i64> %4
}

define <8 x i64> @test_mm512_mask_mul_epu32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__src) nounwind {
; X86-LABEL: test_mm512_mask_mul_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmuludq %zmm0, %zmm1, %zmm2 {%k1}
; X86-NEXT:    vmovdqa64 %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_mul_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmuludq %zmm0, %zmm1, %zmm2 {%k1}
; X64-NEXT:    vmovdqa64 %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %2 = mul nuw <8 x i64> %1, %0
  %3 = bitcast i8 %__k to <8 x i1>
  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %__src
  ret <8 x i64> %4
}

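; set1_epi8 splat via insertelement + zero shufflevector mask; with no 512-bit
; byte broadcast used here, codegen broadcasts into a ymm with vpbroadcastb and
; widens with vinserti64x4.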
define <8 x double> @test_mm512_set1_epi8(i8 signext %d) nounwind {
; X86-LABEL: test_mm512_set1_epi8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vpbroadcastb %xmm0, %ymm0
; X86-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_set1_epi8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vmovd %edi, %xmm0
; X64-NEXT:    vpbroadcastb %xmm0, %ymm0
; X64-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; X64-NEXT:    retq
entry:
  %vecinit.i = insertelement <64 x i8> undef, i8 %d, i32 0
  %vecinit63.i = shufflevector <64 x i8> %vecinit.i, <64 x i8> undef, <64 x i32> zeroinitializer
  %0 = bitcast <64 x i8> %vecinit63.i to <8 x double>
  ret <8 x double> %0
}

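; Unsigned scalar int-to-FP conversions map to vcvtusi2sd/vcvtusi2ss. The i64
; cases have no GPR form on 32-bit x86, so the X86 bodies expand to
; multi-instruction sequences.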
define <2 x double> @test_mm_cvtu32_sd(<2 x double> %__A, i32 %__B) {
; X86-LABEL: test_mm_cvtu32_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vcvtusi2sdl {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_cvtu32_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vcvtusi2sdl %edi, %xmm0, %xmm0
; X64-NEXT:    retq
entry:
  %conv.i = uitofp i32 %__B to double
  %vecins.i = insertelement <2 x double> %__A, double %conv.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_cvtu64_sd(<2 x double> %__A, i64 %__B) {
; X86-LABEL: test_mm_cvtu64_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; X86-NEXT:    vsubpd {{\.LCPI.*}}, %xmm1, %xmm1
; X86-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_cvtu64_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vcvtusi2sdq %rdi, %xmm0, %xmm0
; X64-NEXT:    retq
entry:
  %conv.i = uitofp i64 %__B to double
  %vecins.i = insertelement <2 x double> %__A, double %conv.i, i32 0
  ret <2 x double> %vecins.i
}

define <4 x float> @test_mm_cvtu32_ss(<4 x float> %__A, i32 %__B) {
; X86-LABEL: test_mm_cvtu32_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vcvtusi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_cvtu32_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vcvtusi2ssl %edi, %xmm0, %xmm0
; X64-NEXT:    retq
entry:
  %conv.i = uitofp i32 %__B to float
  %vecins.i = insertelement <4 x float> %__A, float %conv.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_cvtu64_ss(<4 x float> %__A, i64 %__B) {
; X86-LABEL: test_mm_cvtu64_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $16, %esp
; X86-NEXT:    movl 12(%ebp), %eax
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
; X86-NEXT:    vmovq %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    xorl %ecx, %ecx
; X86-NEXT:    testl %eax, %eax
; X86-NEXT:    setns %cl
; X86-NEXT:    fildll {{[0-9]+}}(%esp)
; X86-NEXT:    fadds {{\.LCPI.*}}(,%ecx,4)
; X86-NEXT:    fstps {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_cvtu64_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vcvtusi2ssq %rdi, %xmm0, %xmm0
; X64-NEXT:    retq
entry:
  %conv.i = uitofp i64 %__B to float
  %vecins.i = insertelement <4 x float> %__A, float %conv.i, i32 0
  ret <4 x float> %vecins.i
}

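; float-to-double extension: fpext of eight floats lowers to vcvtps2pd with
; optional {%k1}/{z} masking; the cvtpslo_pd variants first extract the low half
; of the zmm with a shufflevector.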
define <8 x double> @test_mm512_cvtps_pd(<8 x float> %__A) {
; CHECK-LABEL: test_mm512_cvtps_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtps2pd %ymm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %conv.i = fpext <8 x float> %__A to <8 x double>
  ret <8 x double> %conv.i
}

define <8 x double> @test_mm512_cvtpslo_pd(<16 x float> %__A) {
; CHECK-LABEL: test_mm512_cvtpslo_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtps2pd %ymm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shuffle.i.i = shufflevector <16 x float> %__A, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %conv.i.i = fpext <8 x float> %shuffle.i.i to <8 x double>
  ret <8 x double> %conv.i.i
}

define <8 x double> @test_mm512_mask_cvtps_pd(<8 x double> %__W, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm512_mask_cvtps_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2pd %ymm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_cvtps_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2pd %ymm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %conv.i.i = fpext <8 x float> %__A to <8 x double>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %conv.i.i, <8 x double> %__W
  ret <8 x double> %1
}

define <8 x double> @test_mm512_mask_cvtpslo_pd(<8 x double> %__W, i8 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_mask_cvtpslo_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2pd %ymm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_cvtpslo_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2pd %ymm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %shuffle.i.i = shufflevector <16 x float> %__A, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %conv.i.i.i = fpext <8 x float> %shuffle.i.i to <8 x double>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %conv.i.i.i, <8 x double> %__W
  ret <8 x double> %1
}

define <8 x double> @test_mm512_maskz_cvtps_pd(i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm512_maskz_cvtps_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2pd %ymm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_cvtps_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2pd %ymm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %conv.i.i = fpext <8 x float> %__A to <8 x double>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %conv.i.i, <8 x double> zeroinitializer
  ret <8 x double> %1
}

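; Integer narrowing: plain truncs select vpmovdb/vpmovqd/vpmovqw. The masked
; byte/word forms are expressed through the llvm.x86.avx512.mask.pmov.*
; intrinsics declared after this group, while the dword case can use an
; ordinary trunc + select.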
define <2 x i64> @test_mm512_cvtepi32_epi8(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_cvtepi32_epi8:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpmovdb %zmm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %conv.i = trunc <16 x i32> %0 to <16 x i8>
  %1 = bitcast <16 x i8> %conv.i to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm512_mask_cvtepi32_epi8(<2 x i64> %__O, i16 zeroext %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_cvtepi32_epi8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpmovdb %zmm1, %xmm0 {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_cvtepi32_epi8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmovdb %zmm1, %xmm0 {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <2 x i64> %__O to <16 x i8>
  %2 = tail call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %0, <16 x i8> %1, i16 %__M)
  %3 = bitcast <16 x i8> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <2 x i64> @test_mm512_maskz_cvtepi32_epi8(i16 zeroext %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_cvtepi32_epi8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpmovdb %zmm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_cvtepi32_epi8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmovdb %zmm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = tail call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %0, <16 x i8> zeroinitializer, i16 %__M)
  %2 = bitcast <16 x i8> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <4 x i64> @test_mm512_cvtepi64_epi32(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_cvtepi64_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpmovqd %zmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %conv.i = trunc <8 x i64> %__A to <8 x i32>
  %0 = bitcast <8 x i32> %conv.i to <4 x i64>
  ret <4 x i64> %0
}

define <4 x i64> @test_mm512_mask_cvtepi64_epi32(<4 x i64> %__O, i8 zeroext %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_cvtepi64_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmovqd %zmm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_cvtepi64_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmovqd %zmm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %conv.i.i = trunc <8 x i64> %__A to <8 x i32>
  %0 = bitcast <4 x i64> %__O to <8 x i32>
  %1 = bitcast i8 %__M to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i32> %conv.i.i, <8 x i32> %0
  %3 = bitcast <8 x i32> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <4 x i64> @test_mm512_maskz_cvtepi64_epi32(i8 zeroext %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_cvtepi64_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmovqd %zmm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_cvtepi64_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmovqd %zmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %conv.i.i = trunc <8 x i64> %__A to <8 x i32>
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i32> %conv.i.i, <8 x i32> zeroinitializer
  %2 = bitcast <8 x i32> %1 to <4 x i64>
  ret <4 x i64> %2
}

define <2 x i64> @test_mm512_cvtepi64_epi16(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_cvtepi64_epi16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpmovqw %zmm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %conv.i = trunc <8 x i64> %__A to <8 x i16>
  %0 = bitcast <8 x i16> %conv.i to <2 x i64>
  ret <2 x i64> %0
}

define <2 x i64> @test_mm512_mask_cvtepi64_epi16(<2 x i64> %__O, i8 zeroext %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_cvtepi64_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmovqw %zmm1, %xmm0 {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_cvtepi64_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmovqw %zmm1, %xmm0 {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__O to <8 x i16>
  %1 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %__A, <8 x i16> %0, i8 %__M)
  %2 = bitcast <8 x i16> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm512_maskz_cvtepi64_epi16(i8 zeroext %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_cvtepi64_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmovqw %zmm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_cvtepi64_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmovqw %zmm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %__A, <8 x i16> zeroinitializer, i8 %__M)
  %1 = bitcast <8 x i16> %0 to <2 x i64>
  ret <2 x i64> %1
}

declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)
declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)

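; Ternary logic: llvm.x86.avx512.pternlog.{d,q}.512 with immediate 4 maps
; directly to vpternlogd/vpternlogq $4; the mask/maskz wrappers add only a
; bitcast-to-<N x i1> select.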
define <8 x i64> @test_mm512_ternarylogic_epi32(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
; CHECK-LABEL: test_mm512_ternarylogic_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpternlogd $4, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = bitcast <8 x i64> %__C to <16 x i32>
  %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4)
  %4 = bitcast <16 x i32> %3 to <8 x i64>
  ret <8 x i64> %4
}

declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32) #1

define <8 x i64> @test_mm512_mask_ternarylogic_epi32(<8 x i64> %__A, i16 zeroext %__U, <8 x i64> %__B, <8 x i64> %__C) {
; X86-LABEL: test_mm512_mask_ternarylogic_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_ternarylogic_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = bitcast <8 x i64> %__C to <16 x i32>
  %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4)
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %0
  %6 = bitcast <16 x i32> %5 to <8 x i64>
  ret <8 x i64> %6
}

define <8 x i64> @test_mm512_maskz_ternarylogic_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
; X86-LABEL: test_mm512_maskz_ternarylogic_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_ternarylogic_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = bitcast <8 x i64> %__C to <16 x i32>
  %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4)
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
  %6 = bitcast <16 x i32> %5 to <8 x i64>
  ret <8 x i64> %6
}

define <8 x i64> @test_mm512_ternarylogic_epi64(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
; CHECK-LABEL: test_mm512_ternarylogic_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpternlogq $4, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4)
  ret <8 x i64> %0
}

declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32) #1

define <8 x i64> @test_mm512_mask_ternarylogic_epi64(<8 x i64> %__A, i8 zeroext %__U, <8 x i64> %__B, <8 x i64> %__C) {
; X86-LABEL: test_mm512_mask_ternarylogic_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_ternarylogic_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__A
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_ternarylogic_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
; X86-LABEL: test_mm512_maskz_ternarylogic_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_ternarylogic_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
  ret <8 x i64> %2
}

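; mask2_permutex2var: the writemask merges into the index operand, so codegen
; keeps the result in the index register (vpermi2*) and needs a trailing move
; back to zmm0.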
declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>)

define <8 x i64> @test_mm512_mask2_permutex2var_epi32(<8 x i64> %__A, <8 x i64> %__I, i16 zeroext %__U, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask2_permutex2var_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermi2d %zmm2, %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovdqa64 %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask2_permutex2var_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2d %zmm2, %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovdqa64 %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__I to <16 x i32>
  %2 = bitcast <8 x i64> %__B to <16 x i32>
  %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %1
  %6 = bitcast <16 x i32> %5 to <8 x i64>
  ret <8 x i64> %6
}

declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>)

define <8 x double> @test_mm512_mask2_permutex2var_pd(<8 x double> %__A, <8 x i64> %__I, i8 zeroext %__U, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask2_permutex2var_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovapd %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask2_permutex2var_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovapd %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
  %1 = bitcast <8 x i64> %__I to <8 x double>
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x double> %0, <8 x double> %1
  ret <8 x double> %3
}

declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>)

define <16 x float> @test_mm512_mask2_permutex2var_ps(<16 x float> %__A, <8 x i64> %__I, i16 zeroext %__U, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask2_permutex2var_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovaps %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask2_permutex2var_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovaps %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__I to <16 x i32>
  %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
  %2 = bitcast <8 x i64> %__I to <16 x float>
  %3 = bitcast i16 %__U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %1, <16 x float> %2
  ret <16 x float> %4
}

declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>)

define <8 x i64> @test_mm512_mask2_permutex2var_epi64(<8 x i64> %__A, <8 x i64> %__I, i8 zeroext %__U, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask2_permutex2var_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovdqa64 %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask2_permutex2var_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovdqa64 %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__I
  ret <8 x i64> %2
}

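; The plain, merge-masked and zero-masked permutex2var forms merge into the
; first data operand instead, so vpermt2* is selected and no extra register
; copy is required.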
2357define <8 x i64> @test_mm512_permutex2var_epi32(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
2358; CHECK-LABEL: test_mm512_permutex2var_epi32:
2359; CHECK:       # %bb.0: # %entry
2360; CHECK-NEXT:    vpermt2d %zmm2, %zmm1, %zmm0
2361; CHECK-NEXT:    ret{{[l|q]}}
2362entry:
2363  %0 = bitcast <8 x i64> %__A to <16 x i32>
2364  %1 = bitcast <8 x i64> %__I to <16 x i32>
2365  %2 = bitcast <8 x i64> %__B to <16 x i32>
2366  %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
2367  %4 = bitcast <16 x i32> %3 to <8 x i64>
2368  ret <8 x i64> %4
2369}
2370
2371define <8 x i64> @test_mm512_maskz_permutex2var_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
2372; X86-LABEL: test_mm512_maskz_permutex2var_epi32:
2373; X86:       # %bb.0: # %entry
2374; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
2375; X86-NEXT:    vpermt2d %zmm2, %zmm1, %zmm0 {%k1} {z}
2376; X86-NEXT:    retl
2377;
2378; X64-LABEL: test_mm512_maskz_permutex2var_epi32:
2379; X64:       # %bb.0: # %entry
2380; X64-NEXT:    kmovw %edi, %k1
2381; X64-NEXT:    vpermt2d %zmm2, %zmm1, %zmm0 {%k1} {z}
2382; X64-NEXT:    retq
2383entry:
2384  %0 = bitcast <8 x i64> %__A to <16 x i32>
2385  %1 = bitcast <8 x i64> %__I to <16 x i32>
2386  %2 = bitcast <8 x i64> %__B to <16 x i32>
2387  %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
2388  %4 = bitcast i16 %__U to <16 x i1>
2389  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
2390  %6 = bitcast <16 x i32> %5 to <8 x i64>
2391  ret <8 x i64> %6
2392}
2393
2394define <8 x i64> @test_mm512_mask_permutex2var_epi32(<8 x i64> %__A, i16 zeroext %__U, <8 x i64> %__I, <8 x i64> %__B) {
2395; X86-LABEL: test_mm512_mask_permutex2var_epi32:
2396; X86:       # %bb.0: # %entry
2397; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
2398; X86-NEXT:    vpermt2d %zmm2, %zmm1, %zmm0 {%k1}
2399; X86-NEXT:    retl
2400;
2401; X64-LABEL: test_mm512_mask_permutex2var_epi32:
2402; X64:       # %bb.0: # %entry
2403; X64-NEXT:    kmovw %edi, %k1
2404; X64-NEXT:    vpermt2d %zmm2, %zmm1, %zmm0 {%k1}
2405; X64-NEXT:    retq
2406entry:
2407  %0 = bitcast <8 x i64> %__A to <16 x i32>
2408  %1 = bitcast <8 x i64> %__I to <16 x i32>
2409  %2 = bitcast <8 x i64> %__B to <16 x i32>
2410  %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
2411  %4 = bitcast i16 %__U to <16 x i1>
2412  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %0
2413  %6 = bitcast <16 x i32> %5 to <8 x i64>
2414  ret <8 x i64> %6
2415}
2416
2417define <8 x double> @test_mm512_permutex2var_pd(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) {
2418; CHECK-LABEL: test_mm512_permutex2var_pd:
2419; CHECK:       # %bb.0: # %entry
2420; CHECK-NEXT:    vpermt2pd %zmm2, %zmm1, %zmm0
2421; CHECK-NEXT:    ret{{[l|q]}}
2422entry:
2423  %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
2424  ret <8 x double> %0
2425}
2426
2427define <8 x double> @test_mm512_mask_permutex2var_pd(<8 x double> %__A, i8 zeroext %__U, <8 x i64> %__I, <8 x double> %__B) {
2428; X86-LABEL: test_mm512_mask_permutex2var_pd:
2429; X86:       # %bb.0: # %entry
2430; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2431; X86-NEXT:    kmovw %eax, %k1
2432; X86-NEXT:    vpermt2pd %zmm2, %zmm1, %zmm0 {%k1}
2433; X86-NEXT:    retl
2434;
2435; X64-LABEL: test_mm512_mask_permutex2var_pd:
2436; X64:       # %bb.0: # %entry
2437; X64-NEXT:    kmovw %edi, %k1
2438; X64-NEXT:    vpermt2pd %zmm2, %zmm1, %zmm0 {%k1}
2439; X64-NEXT:    retq
2440entry:
2441  %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
2442  %1 = bitcast i8 %__U to <8 x i1>
2443  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
2444  ret <8 x double> %2
2445}
2446
2447define <8 x double> @test_mm512_maskz_permutex2var_pd(i8 zeroext %__U, <8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_permutex2var_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2pd %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_permutex2var_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2pd %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <16 x float> @test_mm512_permutex2var_ps(<16 x float> %__A, <8 x i64> %__I, <16 x float> %__B) {
; CHECK-LABEL: test_mm512_permutex2var_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__I to <16 x i32>
  %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
  ret <16 x float> %1
}

define <16 x float> @test_mm512_mask_permutex2var_ps(<16 x float> %__A, i16 zeroext %__U, <8 x i64> %__I, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_permutex2var_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_permutex2var_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__I to <16 x i32>
  %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %__A
  ret <16 x float> %3
}

define <16 x float> @test_mm512_maskz_permutex2var_ps(i16 zeroext %__U, <16 x float> %__A, <8 x i64> %__I, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_permutex2var_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_permutex2var_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__I to <16 x i32>
  %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}

define <8 x i64> @test_mm512_permutex2var_epi64(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_permutex2var_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermt2q %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
  ret <8 x i64> %0
}

define <8 x i64> @test_mm512_mask_permutex2var_epi64(<8 x i64> %__A, i8 zeroext %__U, <8 x i64> %__I, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_permutex2var_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2q %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_permutex2var_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2q %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__A
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_permutex2var_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_permutex2var_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2q %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_permutex2var_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2q %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
  ret <8 x i64> %2
}

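; Scalar masked FP arithmetic on element 0 (add/sub/mul). Only bit 0 of %__U
; matters: the IR tests it with 'and i8 %__U, 1' + icmp and selects between the
; computed scalar and the passthru (or zero) lane. A rough C-level sketch of
; what these mirror, assuming the file's usual test_<intrinsic> naming (the
; actual builtins live in clang, not here):
;   __m128 test_mm_mask_add_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
;     return _mm_mask_add_ss(__W, __U, __A, __B); // assumed intrinsic mapping
;   }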
define <4 x float> @test_mm_mask_add_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_add_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vaddss %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_add_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddss %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <4 x float> %__B, i32 0
  %vecext1.i.i = extractelement <4 x float> %__A, i32 0
  %add.i.i = fadd float %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %vecext1.i = extractelement <4 x float> %__W, i32 0
  %cond.i = select i1 %tobool.i, float %vecext1.i, float %add.i.i
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_add_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_maskz_add_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vaddss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_add_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <4 x float> %__B, i32 0
  %vecext1.i.i = extractelement <4 x float> %__A, i32 0
  %add.i.i = fadd float %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %cond.i = select i1 %tobool.i, float 0.000000e+00, float %add.i.i
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <2 x double> @test_mm_mask_add_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_add_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vaddsd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_add_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddsd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <2 x double> %__B, i32 0
  %vecext1.i.i = extractelement <2 x double> %__A, i32 0
  %add.i.i = fadd double %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %vecext1.i = extractelement <2 x double> %__W, i32 0
  %cond.i = select i1 %tobool.i, double %vecext1.i, double %add.i.i
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_add_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_maskz_add_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_add_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <2 x double> %__B, i32 0
  %vecext1.i.i = extractelement <2 x double> %__A, i32 0
  %add.i.i = fadd double %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %cond.i = select i1 %tobool.i, double 0.000000e+00, double %add.i.i
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <4 x float> @test_mm_mask_sub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_sub_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsubss %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_sub_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsubss %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <4 x float> %__B, i32 0
  %vecext1.i.i = extractelement <4 x float> %__A, i32 0
  %sub.i.i = fsub float %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %vecext1.i = extractelement <4 x float> %__W, i32 0
  %cond.i = select i1 %tobool.i, float %vecext1.i, float %sub.i.i
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_sub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_maskz_sub_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsubss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_sub_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsubss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <4 x float> %__B, i32 0
  %vecext1.i.i = extractelement <4 x float> %__A, i32 0
  %sub.i.i = fsub float %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %cond.i = select i1 %tobool.i, float 0.000000e+00, float %sub.i.i
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <2 x double> @test_mm_mask_sub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_sub_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsubsd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_sub_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsubsd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <2 x double> %__B, i32 0
  %vecext1.i.i = extractelement <2 x double> %__A, i32 0
  %sub.i.i = fsub double %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %vecext1.i = extractelement <2 x double> %__W, i32 0
  %cond.i = select i1 %tobool.i, double %vecext1.i, double %sub.i.i
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_sub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_maskz_sub_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsubsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_sub_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsubsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <2 x double> %__B, i32 0
  %vecext1.i.i = extractelement <2 x double> %__A, i32 0
  %sub.i.i = fsub double %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %cond.i = select i1 %tobool.i, double 0.000000e+00, double %sub.i.i
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <4 x float> @test_mm_mask_mul_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_mul_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmulss %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_mul_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmulss %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <4 x float> %__B, i32 0
  %vecext1.i.i = extractelement <4 x float> %__A, i32 0
  %mul.i.i = fmul float %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %vecext1.i = extractelement <4 x float> %__W, i32 0
  %cond.i = select i1 %tobool.i, float %vecext1.i, float %mul.i.i
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_mul_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_maskz_mul_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmulss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_mul_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmulss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <4 x float> %__B, i32 0
  %vecext1.i.i = extractelement <4 x float> %__A, i32 0
  %mul.i.i = fmul float %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %cond.i = select i1 %tobool.i, float 0.000000e+00, float %mul.i.i
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <2 x double> @test_mm_mask_mul_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_mul_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmulsd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_mul_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmulsd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <2 x double> %__B, i32 0
  %vecext1.i.i = extractelement <2 x double> %__A, i32 0
  %mul.i.i = fmul double %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %vecext1.i = extractelement <2 x double> %__W, i32 0
  %cond.i = select i1 %tobool.i, double %vecext1.i, double %mul.i.i
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_mul_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_maskz_mul_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmulsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_mul_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmulsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <2 x double> %__B, i32 0
  %vecext1.i.i = extractelement <2 x double> %__A, i32 0
  %mul.i.i = fmul double %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %cond.i = select i1 %tobool.i, double 0.000000e+00, double %mul.i.i
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

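; The div variants below exercise the same masked scalar pattern, but the mask
; bit is read with a different IR idiom: bitcast i8 %__U to <8 x i1> and
; extractelement of lane 0, rather than 'and i8 %__U, 1' + icmp as above.
; Both forms lower to the same masked vdivss/vdivsd.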
define <4 x float> @test_mm_mask_div_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_div_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vdivss %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_div_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vdivss %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__A, i64 0
  %1 = extractelement <4 x float> %__B, i64 0
  %2 = extractelement <4 x float> %__W, i64 0
  %3 = fdiv float %0, %1
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, float %3, float %2
  %7 = insertelement <4 x float> %__A, float %6, i64 0
  ret <4 x float> %7
}

define <4 x float> @test_mm_maskz_div_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_maskz_div_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vdivss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_div_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vdivss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__A, i64 0
  %1 = extractelement <4 x float> %__B, i64 0
  %2 = fdiv float %0, %1
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = extractelement <8 x i1> %3, i64 0
  %5 = select i1 %4, float %2, float 0.000000e+00
  %6 = insertelement <4 x float> %__A, float %5, i64 0
  ret <4 x float> %6
}

define <2 x double> @test_mm_mask_div_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_div_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vdivsd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_div_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vdivsd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__A, i64 0
  %1 = extractelement <2 x double> %__B, i64 0
  %2 = extractelement <2 x double> %__W, i64 0
  %3 = fdiv double %0, %1
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, double %3, double %2
  %7 = insertelement <2 x double> %__A, double %6, i64 0
  ret <2 x double> %7
}

define <2 x double> @test_mm_maskz_div_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_maskz_div_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vdivsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_div_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vdivsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__A, i64 0
  %1 = extractelement <2 x double> %__B, i64 0
  %2 = fdiv double %0, %1
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = extractelement <8 x i1> %3, i64 0
  %5 = select i1 %4, double %2, double 0.000000e+00
  %6 = insertelement <2 x double> %__A, double %5, i64 0
  ret <2 x double> %6
}

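; 512-bit FMA with an explicit rounding mode. The trailing 'i32 8' operand is
; _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, which the printer renders as
; {rn-sae}. Negated operands (fmsub/fnmadd/fnmsub) are written as
; 'fsub <-0.0, ...>, %x'; the checks below show where the backend folds that
; sign flip into vfmsub/vfnmadd and where it stays a vpxorq/vpxord.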
define <8 x double> @test_mm512_fmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmadd_round_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  ret <8 x double> %0
}

declare <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i32) #1

define <8 x double> @test_mm512_mask_fmadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_fmsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxorq {{\.LCPI.*}}{1to8}, %zmm2, %zmm2
; X86-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fmsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxorq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; X64-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask_fmsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fnmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_fnmadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxorq {{\.LCPI.*}}{1to8}, %zmm0, %zmm0
; X86-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fnmadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxorq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; X64-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask3_fnmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fnmadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fnmadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fnmadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fnmsub_round_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0]
; CHECK-NEXT:    vpxorq %zmm3, %zmm0, %zmm4
; CHECK-NEXT:    vpxorq %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    vfmadd231pd {rn-sae}, %zmm4, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %sub1, i32 8)
  ret <8 x double> %0
}

define <8 x double> @test_mm512_maskz_fnmsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fnmsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %sub1, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

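; The same FMA coverage without rounding, built on the generic llvm.fma.v8f64
; intrinsic. Per the checks, the masked forms fold the fsub-based negation into
; vfmsub/vfnmadd/vfnmsub, while the unmasked fmsub/fnmadd/fnmsub forms keep an
; explicit sign-bit XOR (vpxorq) feeding a plain vfmadd.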
define <8 x double> @test_mm512_fmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmadd_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask_fmadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231pd {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231pd {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxorq {{\.LCPI.*}}{1to8}, %zmm2, %zmm2
; X86-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxorq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; X64-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask_fmsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub132pd {{.*#+}} zmm0 = (zmm0 * zmm1) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub132pd {{.*#+}} zmm0 = (zmm0 * zmm1) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fnmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxorq {{\.LCPI.*}}{1to8}, %zmm0, %zmm0
; X86-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxorq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; X64-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask3_fnmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd231pd {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231pd {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fnmadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fnmsub_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0]
; CHECK-NEXT:    vpxorq %zmm3, %zmm0, %zmm4
; CHECK-NEXT:    vpxorq %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %sub1.i) #10
  ret <8 x double> %0
}

define <8 x double> @test_mm512_maskz_fnmsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fnmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %sub1.i) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

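; Float (16 x f32) counterparts of the rounded pd tests above: i16 masks
; instead of i8, and the constant-pool sign-bit XOR uses a {1to16} dword
; broadcast (vpxord) instead of the {1to8} qword broadcast (vpxorq).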
define <16 x float> @test_mm512_fmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fmadd_round_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
  ret <16 x float> %0
}

declare <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i32) #1

define <16 x float> @test_mm512_mask_fmadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_fmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_fmsub_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxord {{\.LCPI.*}}{1to16}, %zmm2, %zmm2
; X86-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fmsub_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxord {{.*}}(%rip){1to16}, %zmm2, %zmm2
; X64-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
  ret <16 x float> %0
}

define <16 x float> @test_mm512_mask_fmsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmsub_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmsub_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmsub_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmsub_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_fnmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_fnmadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxord {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
; X86-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fnmadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0
; X64-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
  ret <16 x float> %0
}

define <16 x float> @test_mm512_mask3_fnmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfnmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fnmadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fnmadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fnmadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfnmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fnmsub_round_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0]
; CHECK-NEXT:    vpxorq %zmm3, %zmm0, %zmm4
; CHECK-NEXT:    vpxorq %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    vfmadd231ps {rn-sae}, %zmm4, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %sub1, i32 8)
  ret <16 x float> %0
}

define <16 x float> @test_mm512_maskz_fnmsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fnmsub_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmsub_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %sub1, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

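; Non-rounded 16 x float FMA tests via llvm.fma.v16f32, mirroring the pd block.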
3649define <16 x float> @test_mm512_fmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3650; CHECK-LABEL: test_mm512_fmadd_ps:
3651; CHECK:       # %bb.0: # %entry
3652; CHECK-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3653; CHECK-NEXT:    ret{{[l|q]}}
3654entry:
3655  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
3656  ret <16 x float> %0
3657}
3658
3659define <16 x float> @test_mm512_mask_fmadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
3660; X86-LABEL: test_mm512_mask_fmadd_ps:
3661; X86:       # %bb.0: # %entry
3662; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
3663; X86-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
3664; X86-NEXT:    retl
3665;
3666; X64-LABEL: test_mm512_mask_fmadd_ps:
3667; X64:       # %bb.0: # %entry
3668; X64-NEXT:    kmovw %edi, %k1
3669; X64-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
3670; X64-NEXT:    retq
3671entry:
3672  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
3673  %1 = bitcast i16 %__U to <16 x i1>
3674  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
3675  ret <16 x float> %2
3676}
3677
define <16 x float> @test_mm512_mask3_fmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfmadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

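; Zero-masking: the fma result is selected against zeroinitializer, with the
; mask loaded into %k1 from the stack (X86) or %edi (X64).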
define <16 x float> @test_mm512_maskz_fmadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

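; fmsub has no separate fma intrinsic here: the negated addend is built with an
; fsub from a -0.0 splat. Unmasked, the negation stays visible as a vpxord of
; the sign bits; masked, it should fold into vfmsub132ps/vfmsub213ps.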
define <16 x float> @test_mm512_fmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxord {{\.LCPI.*}}{1to16}, %zmm2, %zmm2
; X86-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxord {{.*}}(%rip){1to16}, %zmm2, %zmm2
; X64-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
  ret <16 x float> %0
}

define <16 x float> @test_mm512_mask_fmsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfmsub132ps {{.*#+}} zmm0 = (zmm0 * zmm1) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub132ps {{.*#+}} zmm0 = (zmm0 * zmm1) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

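; fnmadd negates the multiplicand %__A rather than the addend; masked forms
; fold this into vfnmadd231ps/vfnmadd213ps, while the unmasked form keeps an
; explicit vpxord.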
define <16 x float> @test_mm512_fnmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxord {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
; X86-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0
; X64-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
  ret <16 x float> %0
}

define <16 x float> @test_mm512_mask3_fnmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfnmadd231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fnmadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

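; fnmsub negates both %__A and %__C, so the unmasked form materializes a
; sign-bit broadcast and two vpxorq ops before the fma.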
define <16 x float> @test_mm512_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fnmsub_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0]
; CHECK-NEXT:    vpxorq %zmm3, %zmm0, %zmm4
; CHECK-NEXT:    vpxorq %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    vfmadd231ps {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %sub1.i) #10
  ret <16 x float> %0
}

define <16 x float> @test_mm512_maskz_fnmsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fnmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %sub1.i) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

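; Rounding variants go through the @llvm.x86.avx512.vfmaddsub.*.512 intrinsics;
; the i32 8 argument (round-to-nearest, suppress exceptions) shows up as the
; {rn-sae} modifier.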
define <8 x double> @test_mm512_fmaddsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmaddsub_round_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  ret <8 x double> %0
}

declare <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i32) #1

define <8 x double> @test_mm512_mask_fmaddsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmaddsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmaddsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fmaddsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmaddsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmaddsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmaddsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmaddsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmaddsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

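; fmsubadd round: the same intrinsic with %__C negated up front, so the vpxorq
; feeds the unmasked vfmaddsub, while masked forms select vfmsubadd directly.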
define <8 x double> @test_mm512_fmsubadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_fmsubadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxorq {{\.LCPI.*}}{1to8}, %zmm2, %zmm2
; X86-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fmsubadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxorq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; X64-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask_fmsubadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmsubadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmsubadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmsubadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmsubadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmsubadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

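; Without rounding control, fmaddsub is open-coded: two @llvm.fma.v8f64 calls
; (one on %__C, one on its negation) interleaved by a shufflevector, which
; should still fold into a single vfmaddsub.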
define <8 x double> @test_mm512_fmaddsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmaddsub_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
  %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x double> %3
}

define <8 x double> @test_mm512_mask_fmaddsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmaddsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub132pd {{.*#+}} zmm0 = (zmm0 * zmm1) +/- zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmaddsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub132pd {{.*#+}} zmm0 = (zmm0 * zmm1) +/- zmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
  %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %__A
  ret <8 x double> %5
}

define <8 x double> @test_mm512_mask3_fmaddsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmaddsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub231pd {{.*#+}} zmm2 = (zmm0 * zmm1) +/- zmm2
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmaddsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub231pd {{.*#+}} zmm2 = (zmm0 * zmm1) +/- zmm2
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
  %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %__C
  ret <8 x double> %5
}

define <8 x double> @test_mm512_maskz_fmaddsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmaddsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmaddsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
  %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x double> %3, <8 x double> zeroinitializer
  ret <8 x double> %5
}

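; fmsubadd swaps the shufflevector inputs (subtract in even lanes, add in odd),
; folding to the vfmsubadd forms instead.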
define <8 x double> @test_mm512_fmsubadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmsubadd_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
  %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask_fmsubadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmsubadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmsubadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
  %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__A
  ret <8 x double> %4
}

define <8 x double> @test_mm512_maskz_fmsubadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmsubadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmsubadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
  %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
  ret <8 x double> %4
}

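; The <16 x float> fmaddsub/fmsubadd rounding tests mirror the pd versions
; above, with i16 masks and the ps encodings.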
define <16 x float> @test_mm512_fmaddsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fmaddsub_round_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
  ret <16 x float> %0
}

declare <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i32) #1

define <16 x float> @test_mm512_mask_fmaddsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmaddsub_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfmaddsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmaddsub_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fmaddsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmaddsub_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfmaddsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmaddsub_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmaddsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmaddsub_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmaddsub_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_fmsubadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_fmsubadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxord {{\.LCPI.*}}{1to16}, %zmm2, %zmm2
; X86-NEXT:    vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fmsubadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxord {{.*}}(%rip){1to16}, %zmm2, %zmm2
; X64-NEXT:    vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
  ret <16 x float> %0
}

define <16 x float> @test_mm512_mask_fmsubadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmsubadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfmsubadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmsubadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmsubadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmsubadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfmsubadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmsubadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

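; Non-rounding ps fmaddsub/fmsubadd: the same two-fma-plus-shuffle pattern as
; the pd tests, now with a 16-lane interleave mask.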
define <16 x float> @test_mm512_fmaddsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fmaddsub_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
  %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  ret <16 x float> %3
}

define <16 x float> @test_mm512_mask_fmaddsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmaddsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfmaddsub132ps {{.*#+}} zmm0 = (zmm0 * zmm1) +/- zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmaddsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub132ps {{.*#+}} zmm0 = (zmm0 * zmm1) +/- zmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
  %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> %__A
  ret <16 x float> %5
}

define <16 x float> @test_mm512_mask3_fmaddsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmaddsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfmaddsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) +/- zmm2
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmaddsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) +/- zmm2
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
  %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> %__C
  ret <16 x float> %5
}

define <16 x float> @test_mm512_maskz_fmaddsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmaddsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmaddsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
  %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> zeroinitializer
  ret <16 x float> %5
}

define <16 x float> @test_mm512_fmsubadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fmsubadd_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
  %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_fmsubadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmsubadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfmsubadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmsubadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
  %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  %3 = bitcast i16 %__U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__A
  ret <16 x float> %4
}

define <16 x float> @test_mm512_maskz_fmsubadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmsubadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmsubadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
  %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  %3 = bitcast i16 %__U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
  ret <16 x float> %4
}

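; mask3 fmsub: negate %__C, run the (possibly rounded) fma, then select against
; %__C so the 231 form writes into zmm2.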
define <8 x double> @test_mm512_mask3_fmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub231pd {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub231pd {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <16 x float> @test_mm512_mask3_fmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsub_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmsub_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfmsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

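; mask3 fmsubadd keeps %__C as the select passthru; the rounding variant uses
; the intrinsic, the plain one the two-fma shuffle pattern.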
define <8 x double> @test_mm512_mask3_fmsubadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsubadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmsubadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fmsubadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsubadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd231pd {{.*#+}} zmm2 = (zmm0 * zmm1) -/+ zmm2
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmsubadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd231pd {{.*#+}} zmm2 = (zmm0 * zmm1) -/+ zmm2
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
  %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__C
  ret <8 x double> %4
}

define <16 x float> @test_mm512_mask3_fmsubadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsubadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfmsubadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmsubadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fmsubadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsubadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfmsubadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) -/+ zmm2
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmsubadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) -/+ zmm2
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
  %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  %3 = bitcast i16 %__U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__C
  ret <16 x float> %4
}

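; Masked fnmadd: %__A is negated but is also the select passthru, so the 132
; form keeps it in zmm0 with the negation folded.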
4600define <8 x double> @test_mm512_mask_fnmadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
4601; X86-LABEL: test_mm512_mask_fnmadd_round_pd:
4602; X86:       # %bb.0: # %entry
4603; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4604; X86-NEXT:    kmovw %eax, %k1
4605; X86-NEXT:    vfnmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4606; X86-NEXT:    retl
4607;
4608; X64-LABEL: test_mm512_mask_fnmadd_round_pd:
4609; X64:       # %bb.0: # %entry
4610; X64-NEXT:    kmovw %edi, %k1
4611; X64-NEXT:    vfnmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4612; X64-NEXT:    retq
4613entry:
4614  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
4615  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
4616  %1 = bitcast i8 %__U to <8 x i1>
4617  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
4618  ret <8 x double> %2
4619}
4620
define <8 x double> @test_mm512_mask_fnmadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd132pd {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd132pd {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <16 x float> @test_mm512_mask_fnmadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fnmadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfnmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fnmadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_fnmadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfnmadd132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

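; FNMSUB negates both the multiplicand (__B) and the addend (__C) before the
; FMA call, which matches the vfnmsub132/vfnmsub231 forms.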
define <8 x double> @test_mm512_mask_fnmsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fnmsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fnmsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
  %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %sub, <8 x double> %sub1, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

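; mask3 variants blend against __C, so the 231 form writing the addend
; register is selected, followed by a vmovapd/vmovaps copy into zmm0.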
define <8 x double> @test_mm512_mask3_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fnmsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
  %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %sub, <8 x double> %sub1, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask_fnmsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fnmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub132pd {{.*#+}} zmm0 = -(zmm0 * zmm1) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fnmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub132pd {{.*#+}} zmm0 = -(zmm0 * zmm1) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
  %sub2.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %sub.i, <8 x double> %sub2.i) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub231pd {{.*#+}} zmm2 = -(zmm0 * zmm1) - zmm2
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fnmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub231pd {{.*#+}} zmm2 = -(zmm0 * zmm1) - zmm2
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
  %sub2.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %sub.i, <8 x double> %sub2.i) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <16 x float> @test_mm512_mask_fnmsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fnmsub_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfnmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fnmsub_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %sub, <16 x float> %sub1, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmsub_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfnmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fnmsub_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %sub, <16 x float> %sub1, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_fnmsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fnmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfnmsub132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fnmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %sub.i, <16 x float> %sub1.i) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfnmsub231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) - zmm2
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fnmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) - zmm2
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %sub.i, <16 x float> %sub1.i) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

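; Scalar (ss/sd) tests: element 0 is extracted, run through a scalar FMA, and
; reinserted. The mask is reduced to its low bit, either via `and i8 %__U, 1`
; (the non-rounding variants) or by extracting lane 0 of the bitcast <8 x i1>.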
define <4 x float> @test_mm_mask_fmadd_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fmadd_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmadd_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %1 = extractelement <4 x float> %__A, i64 0
  %2 = extractelement <4 x float> %__B, i64 0
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <4 x float> %__W, i32 0
  %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
  %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask_fmadd_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fmadd_round_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmadd_round_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %1 = extractelement <4 x float> %__A, i64 0
  %2 = extractelement <4 x float> %__B, i64 0
  %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, float %3, float %0
  %7 = insertelement <4 x float> %__W, float %6, i64 0
  ret <4 x float> %7
}

declare float @llvm.x86.avx512.vfmadd.f32(float, float, float, i32) #1

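; maskz variants select 0.0 instead of a passthru element when the mask bit
; is clear; for the rounding tests this shows up as the {%k1} {z}
; zeroing-masked forms in the checks.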
define <4 x float> @test_mm_maskz_fmadd_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmadd_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmadd_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__A, i64 0
  %1 = extractelement <4 x float> %__B, i64 0
  %2 = extractelement <4 x float> %__C, i64 0
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_fmadd_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmadd_round_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmadd_round_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__A, i64 0
  %1 = extractelement <4 x float> %__B, i64 0
  %2 = extractelement <4 x float> %__C, i64 0
  %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, float %3, float 0.000000e+00
  %7 = insertelement <4 x float> %__A, float %6, i64 0
  ret <4 x float> %7
}

define <4 x float> @test_mm_mask3_fmadd_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmadd_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231ss {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmadd_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231ss {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %1 = extractelement <4 x float> %__X, i64 0
  %2 = extractelement <4 x float> %__Y, i64 0
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <4 x float> %__Y, i32 0
  %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
  %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask3_fmadd_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmadd_round_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmadd_round_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %1 = extractelement <4 x float> %__X, i64 0
  %2 = extractelement <4 x float> %__Y, i64 0
  %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, float %3, float %2
  %7 = insertelement <4 x float> %__Y, float %6, i64 0
  ret <4 x float> %7
}

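; Scalar FMSUB: only the extracted element of the addend is negated, so no
; vector-wide fsub is needed.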
define <4 x float> @test_mm_mask_fmsub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fmsub_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmsub_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %1 = extractelement <4 x float> %__A, i64 0
  %.rhs.i = extractelement <4 x float> %__B, i64 0
  %2 = fsub float -0.000000e+00, %.rhs.i
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <4 x float> %__W, i32 0
  %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
  %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask_fmsub_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fmsub_round_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmsub_round_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %1 = extractelement <4 x float> %__A, i64 0
  %.rhs = extractelement <4 x float> %__B, i64 0
  %2 = fsub float -0.000000e+00, %.rhs
  %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, float %3, float %0
  %7 = insertelement <4 x float> %__W, float %6, i64 0
  ret <4 x float> %7
}

define <4 x float> @test_mm_maskz_fmsub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmsub_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmsub_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__A, i64 0
  %1 = extractelement <4 x float> %__B, i64 0
  %.rhs.i = extractelement <4 x float> %__C, i64 0
  %2 = fsub float -0.000000e+00, %.rhs.i
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_fmsub_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmsub_round_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmsub_round_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__A, i64 0
  %1 = extractelement <4 x float> %__B, i64 0
  %.rhs = extractelement <4 x float> %__C, i64 0
  %2 = fsub float -0.000000e+00, %.rhs
  %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, float %3, float 0.000000e+00
  %7 = insertelement <4 x float> %__A, float %6, i64 0
  ret <4 x float> %7
}

define <4 x float> @test_mm_mask3_fmsub_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsub_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub231ss {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmsub_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub231ss {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %1 = extractelement <4 x float> %__X, i64 0
  %.rhs.i = extractelement <4 x float> %__Y, i64 0
  %2 = fsub float -0.000000e+00, %.rhs.i
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <4 x float> %__Y, i32 0
  %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
  %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask3_fmsub_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsub_round_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmsub_round_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %1 = extractelement <4 x float> %__X, i64 0
  %.rhs = extractelement <4 x float> %__Y, i64 0
  %2 = fsub float -0.000000e+00, %.rhs
  %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, float %3, float %.rhs
  %7 = insertelement <4 x float> %__Y, float %6, i64 0
  ret <4 x float> %7
}

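; Scalar FNMADD: the extracted multiplicand element is negated before the
; scalar FMA call.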
define <4 x float> @test_mm_mask_fnmadd_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fnmadd_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fnmadd_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %.rhs.i = extractelement <4 x float> %__A, i64 0
  %1 = fsub float -0.000000e+00, %.rhs.i
  %2 = extractelement <4 x float> %__B, i64 0
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <4 x float> %__W, i32 0
  %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
  %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask_fnmadd_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fnmadd_round_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fnmadd_round_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %.rhs = extractelement <4 x float> %__A, i64 0
  %1 = fsub float -0.000000e+00, %.rhs
  %2 = extractelement <4 x float> %__B, i64 0
  %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, float %3, float %0
  %7 = insertelement <4 x float> %__W, float %6, i64 0
  ret <4 x float> %7
}

define <4 x float> @test_mm_maskz_fnmadd_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fnmadd_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fnmadd_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__A, i64 0
  %.rhs.i = extractelement <4 x float> %__B, i64 0
  %1 = fsub float -0.000000e+00, %.rhs.i
  %2 = extractelement <4 x float> %__C, i64 0
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_fnmadd_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fnmadd_round_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fnmadd_round_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__A, i64 0
  %.rhs = extractelement <4 x float> %__B, i64 0
  %1 = fsub float -0.000000e+00, %.rhs
  %2 = extractelement <4 x float> %__C, i64 0
  %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, float %3, float 0.000000e+00
  %7 = insertelement <4 x float> %__A, float %6, i64 0
  ret <4 x float> %7
}

define <4 x float> @test_mm_mask3_fnmadd_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmadd_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fnmadd_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %.rhs.i = extractelement <4 x float> %__X, i64 0
  %1 = fsub float -0.000000e+00, %.rhs.i
  %2 = extractelement <4 x float> %__Y, i64 0
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <4 x float> %__Y, i32 0
  %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
  %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask3_fnmadd_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmadd_round_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fnmadd_round_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %.rhs = extractelement <4 x float> %__X, i64 0
  %1 = fsub float -0.000000e+00, %.rhs
  %2 = extractelement <4 x float> %__Y, i64 0
  %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, float %3, float %2
  %7 = insertelement <4 x float> %__Y, float %6, i64 0
  ret <4 x float> %7
}

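; Scalar FNMSUB: both the multiplicand and the addend elements are negated
; before the scalar FMA call.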
define <4 x float> @test_mm_mask_fnmsub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fnmsub_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fnmsub_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %.rhs.i = extractelement <4 x float> %__A, i64 0
  %1 = fsub float -0.000000e+00, %.rhs.i
  %.rhs7.i = extractelement <4 x float> %__B, i64 0
  %2 = fsub float -0.000000e+00, %.rhs7.i
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext2.i = extractelement <4 x float> %__W, i32 0
  %cond.i = select i1 %tobool.i, float %vecext2.i, float %3
  %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask_fnmsub_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fnmsub_round_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fnmsub_round_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %.rhs = extractelement <4 x float> %__A, i64 0
  %1 = fsub float -0.000000e+00, %.rhs
  %.rhs2 = extractelement <4 x float> %__B, i64 0
  %2 = fsub float -0.000000e+00, %.rhs2
  %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, float %3, float %0
  %7 = insertelement <4 x float> %__W, float %6, i64 0
  ret <4 x float> %7
}

define <4 x float> @test_mm_maskz_fnmsub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fnmsub_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fnmsub_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__A, i64 0
  %.rhs.i = extractelement <4 x float> %__B, i64 0
  %1 = fsub float -0.000000e+00, %.rhs.i
  %.rhs5.i = extractelement <4 x float> %__C, i64 0
  %2 = fsub float -0.000000e+00, %.rhs5.i
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_fnmsub_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fnmsub_round_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fnmsub_round_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__A, i64 0
  %.rhs = extractelement <4 x float> %__B, i64 0
  %1 = fsub float -0.000000e+00, %.rhs
  %.rhs2 = extractelement <4 x float> %__C, i64 0
  %2 = fsub float -0.000000e+00, %.rhs2
  %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, float %3, float 0.000000e+00
  %7 = insertelement <4 x float> %__A, float %6, i64 0
  ret <4 x float> %7
}

define <4 x float> @test_mm_mask3_fnmsub_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmsub_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fnmsub_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %.rhs.i = extractelement <4 x float> %__X, i64 0
  %1 = fsub float -0.000000e+00, %.rhs.i
  %.rhs7.i = extractelement <4 x float> %__Y, i64 0
  %2 = fsub float -0.000000e+00, %.rhs7.i
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext2.i = extractelement <4 x float> %__Y, i32 0
  %cond.i = select i1 %tobool.i, float %vecext2.i, float %3
  %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask3_fnmsub_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmsub_round_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fnmsub_round_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %.rhs = extractelement <4 x float> %__X, i64 0
  %1 = fsub float -0.000000e+00, %.rhs
  %.rhs1 = extractelement <4 x float> %__Y, i64 0
  %2 = fsub float -0.000000e+00, %.rhs1
  %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, float %3, float %.rhs1
  %7 = insertelement <4 x float> %__Y, float %6, i64 0
  ret <4 x float> %7
}

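; The sd tests mirror the ss ones with <2 x double>, @llvm.fma.f64, and
; @llvm.x86.avx512.vfmadd.f64.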
define <2 x double> @test_mm_mask_fmadd_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_fmadd_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmadd_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %1 = extractelement <2 x double> %__A, i64 0
  %2 = extractelement <2 x double> %__B, i64 0
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <2 x double> %__W, i32 0
  %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
  %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_mask_fmadd_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_fmadd_round_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmadd_round_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %1 = extractelement <2 x double> %__A, i64 0
  %2 = extractelement <2 x double> %__B, i64 0
  %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, double %3, double %0
  %7 = insertelement <2 x double> %__W, double %6, i64 0
  ret <2 x double> %7
}

declare double @llvm.x86.avx512.vfmadd.f64(double, double, double, i32) #1

define <2 x double> @test_mm_maskz_fmadd_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmadd_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmadd_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__A, i64 0
  %1 = extractelement <2 x double> %__B, i64 0
  %2 = extractelement <2 x double> %__C, i64 0
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_fmadd_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmadd_round_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmadd_round_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__A, i64 0
  %1 = extractelement <2 x double> %__B, i64 0
  %2 = extractelement <2 x double> %__C, i64 0
  %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, double %3, double 0.000000e+00
  %7 = insertelement <2 x double> %__A, double %6, i64 0
  ret <2 x double> %7
}

define <2 x double> @test_mm_mask3_fmadd_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmadd_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231sd {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmadd_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231sd {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %1 = extractelement <2 x double> %__X, i64 0
  %2 = extractelement <2 x double> %__Y, i64 0
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <2 x double> %__Y, i32 0
  %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
  %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_mask3_fmadd_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmadd_round_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmadd_round_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %1 = extractelement <2 x double> %__X, i64 0
  %2 = extractelement <2 x double> %__Y, i64 0
  %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, double %3, double %2
  %7 = insertelement <2 x double> %__Y, double %6, i64 0
  ret <2 x double> %7
}

define <2 x double> @test_mm_mask_fmsub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_fmsub_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmsub_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %1 = extractelement <2 x double> %__A, i64 0
  %.rhs.i = extractelement <2 x double> %__B, i64 0
  %2 = fsub double -0.000000e+00, %.rhs.i
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <2 x double> %__W, i32 0
  %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
  %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

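; In the rounding fmsub/fnmadd sd variants below the negation is not folded
; into a masked vfmsub/vfnmadd: the checks show the sign bit flipped with
; vxorpd against a constant-pool mask, then a masked vfmadd213sd.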
define <2 x double> @test_mm_mask_fmsub_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_fmsub_round_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    vxorpd {{\.LCPI.*}}, %xmm2, %xmm2
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmsub_round_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vxorpd {{.*}}(%rip), %xmm2, %xmm2
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %1 = extractelement <2 x double> %__A, i64 0
  %.rhs = extractelement <2 x double> %__B, i64 0
  %2 = fsub double -0.000000e+00, %.rhs
  %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, double %3, double %0
  %7 = insertelement <2 x double> %__W, double %6, i64 0
  ret <2 x double> %7
}

define <2 x double> @test_mm_maskz_fmsub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmsub_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmsub_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__A, i64 0
  %1 = extractelement <2 x double> %__B, i64 0
  %.rhs.i = extractelement <2 x double> %__C, i64 0
  %2 = fsub double -0.000000e+00, %.rhs.i
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_fmsub_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmsub_round_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    vxorpd {{\.LCPI.*}}, %xmm2, %xmm2
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmsub_round_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vxorpd {{.*}}(%rip), %xmm2, %xmm2
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__A, i64 0
  %1 = extractelement <2 x double> %__B, i64 0
  %.rhs = extractelement <2 x double> %__C, i64 0
  %2 = fsub double -0.000000e+00, %.rhs
  %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, double %3, double 0.000000e+00
  %7 = insertelement <2 x double> %__A, double %6, i64 0
  ret <2 x double> %7
}

define <2 x double> @test_mm_mask3_fmsub_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsub_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub231sd {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmsub_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub231sd {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %1 = extractelement <2 x double> %__X, i64 0
  %.rhs.i = extractelement <2 x double> %__Y, i64 0
  %2 = fsub double -0.000000e+00, %.rhs.i
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <2 x double> %__Y, i32 0
  %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
  %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

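; Unlike its ss counterpart, this mask3 rounding variant is not matched to a
; single masked vfmsub; the checks show an unmasked vfmadd213sd into xmm1
; followed by a masked vmovsd blend into the __Y register.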
define <2 x double> @test_mm_mask3_fmsub_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsub_round_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    vxorpd {{\.LCPI.*}}, %xmm2, %xmm3
; X86-NEXT:    vfmadd213sd %xmm3, %xmm0, %xmm1
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovsd %xmm1, %xmm2, %xmm2 {%k1}
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmsub_round_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vxorpd {{.*}}(%rip), %xmm2, %xmm3
; X64-NEXT:    vfmadd213sd %xmm3, %xmm0, %xmm1
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovsd %xmm1, %xmm2, %xmm2 {%k1}
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %1 = extractelement <2 x double> %__X, i64 0
  %.rhs = extractelement <2 x double> %__Y, i64 0
  %2 = fsub double -0.000000e+00, %.rhs
  %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, double %3, double %.rhs
  %7 = insertelement <2 x double> %__Y, double %6, i64 0
  ret <2 x double> %7
}

define <2 x double> @test_mm_mask_fnmadd_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_fnmadd_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fnmadd_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %.rhs.i = extractelement <2 x double> %__A, i64 0
  %1 = fsub double -0.000000e+00, %.rhs.i
  %2 = extractelement <2 x double> %__B, i64 0
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <2 x double> %__W, i32 0
  %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
  %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

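; As above: the negated multiplicand is materialized with vxorpd and the
; masked vfmadd213sd rounding form is used.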
define <2 x double> @test_mm_mask_fnmadd_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_fnmadd_round_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    vxorpd {{\.LCPI.*}}, %xmm1, %xmm1
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fnmadd_round_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vxorpd {{.*}}(%rip), %xmm1, %xmm1
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %.rhs = extractelement <2 x double> %__A, i64 0
  %1 = fsub double -0.000000e+00, %.rhs
  %2 = extractelement <2 x double> %__B, i64 0
  %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, double %3, double %0
  %7 = insertelement <2 x double> %__W, double %6, i64 0
  ret <2 x double> %7
}

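; maskz: a clear mask bit zeroes lane 0 (a select of 0.0 in the IR, {z}
; zero-masking in the generated code).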
define <2 x double> @test_mm_maskz_fnmadd_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fnmadd_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fnmadd_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__A, i64 0
  %.rhs.i = extractelement <2 x double> %__B, i64 0
  %1 = fsub double -0.000000e+00, %.rhs.i
  %2 = extractelement <2 x double> %__C, i64 0
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_fnmadd_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fnmadd_round_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    vxorpd {{\.LCPI.*}}, %xmm1, %xmm1
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fnmadd_round_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vxorpd {{.*}}(%rip), %xmm1, %xmm1
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__A, i64 0
  %.rhs = extractelement <2 x double> %__B, i64 0
  %1 = fsub double -0.000000e+00, %.rhs
  %2 = extractelement <2 x double> %__C, i64 0
  %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, double %3, double 0.000000e+00
  %7 = insertelement <2 x double> %__A, double %6, i64 0
  ret <2 x double> %7
}

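; mask3: the passthrough is the addend __Y, so the 231 form accumulates into
; xmm2 and the result is copied back to xmm0 with vmovapd.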
5948
5949define <2 x double> @test_mm_mask3_fnmadd_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
5950; X86-LABEL: test_mm_mask3_fnmadd_sd:
5951; X86:       # %bb.0: # %entry
5952; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5953; X86-NEXT:    kmovw %eax, %k1
5954; X86-NEXT:    vfnmadd231sd {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
5955; X86-NEXT:    vmovapd %xmm2, %xmm0
5956; X86-NEXT:    retl
5957;
5958; X64-LABEL: test_mm_mask3_fnmadd_sd:
5959; X64:       # %bb.0: # %entry
5960; X64-NEXT:    kmovw %edi, %k1
5961; X64-NEXT:    vfnmadd231sd {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
5962; X64-NEXT:    vmovapd %xmm2, %xmm0
5963; X64-NEXT:    retq
5964entry:
5965  %0 = extractelement <2 x double> %__W, i64 0
5966  %.rhs.i = extractelement <2 x double> %__X, i64 0
5967  %1 = fsub double -0.000000e+00, %.rhs.i
5968  %2 = extractelement <2 x double> %__Y, i64 0
5969  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5970  %4 = and i8 %__U, 1
5971  %tobool.i = icmp eq i8 %4, 0
5972  %vecext1.i = extractelement <2 x double> %__Y, i32 0
5973  %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
5974  %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
5975  ret <2 x double> %vecins.i
5976}
5977
5978define <2 x double> @test_mm_mask3_fnmadd_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
5979; X86-LABEL: test_mm_mask3_fnmadd_round_sd:
5980; X86:       # %bb.0: # %entry
5981; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
5982; X86-NEXT:    vxorpd {{\.LCPI.*}}, %xmm1, %xmm1
5983; X86-NEXT:    kmovw %eax, %k1
5984; X86-NEXT:    vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5985; X86-NEXT:    vmovapd %xmm2, %xmm0
5986; X86-NEXT:    retl
5987;
5988; X64-LABEL: test_mm_mask3_fnmadd_round_sd:
5989; X64:       # %bb.0: # %entry
5990; X64-NEXT:    vxorpd {{.*}}(%rip), %xmm1, %xmm1
5991; X64-NEXT:    kmovw %edi, %k1
5992; X64-NEXT:    vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5993; X64-NEXT:    vmovapd %xmm2, %xmm0
5994; X64-NEXT:    retq
5995entry:
5996  %0 = extractelement <2 x double> %__W, i64 0
5997  %.rhs = extractelement <2 x double> %__X, i64 0
5998  %1 = fsub double -0.000000e+00, %.rhs
5999  %2 = extractelement <2 x double> %__Y, i64 0
6000  %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
6001  %4 = bitcast i8 %__U to <8 x i1>
6002  %5 = extractelement <8 x i1> %4, i64 0
6003  %6 = select i1 %5, double %3, double %2
6004  %7 = insertelement <2 x double> %__Y, double %6, i64 0
6005  ret <2 x double> %7
6006}
6007
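; FNMSUB tests negate both the multiplier and the addend with separate
; "fsub -0.0, x" operations before the fma call; the non-rounding forms fold
; to vfnmsub213sd/vfnmsub231sd and the rounded forms keep {rn-sae}.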
define <2 x double> @test_mm_mask_fnmsub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_fnmsub_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fnmsub_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %.rhs.i = extractelement <2 x double> %__A, i64 0
  %1 = fsub double -0.000000e+00, %.rhs.i
  %.rhs7.i = extractelement <2 x double> %__B, i64 0
  %2 = fsub double -0.000000e+00, %.rhs7.i
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext2.i = extractelement <2 x double> %__W, i32 0
  %cond.i = select i1 %tobool.i, double %vecext2.i, double %3
  %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_mask_fnmsub_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_fnmsub_round_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fnmsub_round_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %.rhs = extractelement <2 x double> %__A, i64 0
  %1 = fsub double -0.000000e+00, %.rhs
  %.rhs2 = extractelement <2 x double> %__B, i64 0
  %2 = fsub double -0.000000e+00, %.rhs2
  %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, double %3, double %0
  %7 = insertelement <2 x double> %__W, double %6, i64 0
  ret <2 x double> %7
}

define <2 x double> @test_mm_maskz_fnmsub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fnmsub_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fnmsub_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__A, i64 0
  %.rhs.i = extractelement <2 x double> %__B, i64 0
  %1 = fsub double -0.000000e+00, %.rhs.i
  %.rhs5.i = extractelement <2 x double> %__C, i64 0
  %2 = fsub double -0.000000e+00, %.rhs5.i
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_fnmsub_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fnmsub_round_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fnmsub_round_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__A, i64 0
  %.rhs = extractelement <2 x double> %__B, i64 0
  %1 = fsub double -0.000000e+00, %.rhs
  %.rhs2 = extractelement <2 x double> %__C, i64 0
  %2 = fsub double -0.000000e+00, %.rhs2
  %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, double %3, double 0.000000e+00
  %7 = insertelement <2 x double> %__A, double %6, i64 0
  ret <2 x double> %7
}

define <2 x double> @test_mm_mask3_fnmsub_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmsub_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub231sd {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fnmsub_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub231sd {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %.rhs.i = extractelement <2 x double> %__X, i64 0
  %1 = fsub double -0.000000e+00, %.rhs.i
  %.rhs7.i = extractelement <2 x double> %__Y, i64 0
  %2 = fsub double -0.000000e+00, %.rhs7.i
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext2.i = extractelement <2 x double> %__Y, i32 0
  %cond.i = select i1 %tobool.i, double %vecext2.i, double %3
  %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_mask3_fnmsub_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmsub_round_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fnmsub_round_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %.rhs = extractelement <2 x double> %__X, i64 0
  %1 = fsub double -0.000000e+00, %.rhs
  %.rhs1 = extractelement <2 x double> %__Y, i64 0
  %2 = fsub double -0.000000e+00, %.rhs1
  %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, double %3, double %.rhs1
  %7 = insertelement <2 x double> %__Y, double %6, i64 0
  ret <2 x double> %7
}

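; Expand-load tests: the intrinsics map onto @llvm.masked.expandload.* and
; select vpexpandq/vpexpandd or vexpandpd/vexpandps by element type, with {z}
; for the maskz forms. Roughly equivalent at the C level (assuming clang's
; <immintrin.h>; illustrative sketch, not part of the checked test):
;   __m512i r = _mm512_mask_expandloadu_epi64(w, u, p);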
define <8 x i64> @test_mm512_mask_expandloadu_epi64(<8 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm512_mask_expandloadu_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vpexpandq (%eax), %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_expandloadu_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpexpandq (%rsi), %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to i64*
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = tail call <8 x i64> @llvm.masked.expandload.v8i64(i64* %0, <8 x i1> %1, <8 x i64> %__W)
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_expandloadu_epi64(i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm512_maskz_expandloadu_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vpexpandq (%eax), %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_expandloadu_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpexpandq (%rsi), %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to i64*
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = tail call <8 x i64> @llvm.masked.expandload.v8i64(i64* %0, <8 x i1> %1, <8 x i64> zeroinitializer)
  ret <8 x i64> %2
}

define <8 x double> @test_mm512_mask_expandloadu_pd(<8 x double> %__W, i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm512_mask_expandloadu_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vexpandpd (%eax), %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_expandloadu_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vexpandpd (%rsi), %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to double*
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = tail call <8 x double> @llvm.masked.expandload.v8f64(double* %0, <8 x i1> %1, <8 x double> %__W)
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm512_maskz_expandloadu_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vexpandpd (%eax), %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_expandloadu_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vexpandpd (%rsi), %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to double*
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = tail call <8 x double> @llvm.masked.expandload.v8f64(double* %0, <8 x i1> %1, <8 x double> zeroinitializer)
  ret <8 x double> %2
}

define <8 x i64> @test_mm512_mask_expandloadu_epi32(<8 x i64> %__W, i16 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm512_mask_expandloadu_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpexpandd (%eax), %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_expandloadu_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpexpandd (%rsi), %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__W to <16 x i32>
  %1 = bitcast i8* %__P to i32*
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = tail call <16 x i32> @llvm.masked.expandload.v16i32(i32* %1, <16 x i1> %2, <16 x i32> %0) #11
  %4 = bitcast <16 x i32> %3 to <8 x i64>
  ret <8 x i64> %4
}

define <8 x i64> @test_mm512_maskz_expandloadu_epi32(i16 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm512_maskz_expandloadu_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpexpandd (%eax), %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_expandloadu_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpexpandd (%rsi), %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to i32*
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = tail call <16 x i32> @llvm.masked.expandload.v16i32(i32* %0, <16 x i1> %1, <16 x i32> zeroinitializer)
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  ret <8 x i64> %3
}

define <16 x float> @test_mm512_mask_expandloadu_ps(<16 x float> %__W, i16 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm512_mask_expandloadu_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vexpandps (%eax), %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_expandloadu_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vexpandps (%rsi), %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to float*
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = tail call <16 x float> @llvm.masked.expandload.v16f32(float* %0, <16 x i1> %1, <16 x float> %__W) #11
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_expandloadu_ps(i16 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm512_maskz_expandloadu_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vexpandps (%eax), %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_expandloadu_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vexpandps (%rsi), %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to float*
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = tail call <16 x float> @llvm.masked.expandload.v16f32(float* %0, <16 x i1> %1, <16 x float> zeroinitializer)
  ret <16 x float> %2
}

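; Compress-store tests are the mirror image: @llvm.masked.compressstore.*
; becomes vcompresspd/vcompressps/vpcompressq/vpcompressd to memory, followed
; by vzeroupper since the functions used zmm registers.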
define void @test_mm512_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <8 x double> %__A) {
; X86-LABEL: test_mm512_mask_compressstoreu_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcompresspd %zmm0, (%ecx) {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_compressstoreu_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %esi, %k1
; X64-NEXT:    vcompresspd %zmm0, (%rdi) {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to double*
  %1 = bitcast i8 %__U to <8 x i1>
  tail call void @llvm.masked.compressstore.v8f64(<8 x double> %__A, double* %0, <8 x i1> %1)
  ret void
}

define void @test_mm512_mask_compressstoreu_epi64(i8* %__P, i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_compressstoreu_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpcompressq %zmm0, (%ecx) {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_compressstoreu_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %esi, %k1
; X64-NEXT:    vpcompressq %zmm0, (%rdi) {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to i64*
  %1 = bitcast i8 %__U to <8 x i1>
  tail call void @llvm.masked.compressstore.v8i64(<8 x i64> %__A, i64* %0, <8 x i1> %1)
  ret void
}

define void @test_mm512_mask_compressstoreu_ps(i8* %__P, i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_mask_compressstoreu_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vcompressps %zmm0, (%eax) {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_compressstoreu_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %esi, %k1
; X64-NEXT:    vcompressps %zmm0, (%rdi) {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to float*
  %1 = bitcast i16 %__U to <16 x i1>
  tail call void @llvm.masked.compressstore.v16f32(<16 x float> %__A, float* %0, <16 x i1> %1)
  ret void
}

define void @test_mm512_mask_compressstoreu_epi32(i8* %__P, i16 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_compressstoreu_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcompressd %zmm0, (%eax) {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_compressstoreu_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %esi, %k1
; X64-NEXT:    vpcompressd %zmm0, (%rdi) {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast i8* %__P to i32*
  %2 = bitcast i16 %__U to <16 x i1>
  tail call void @llvm.masked.compressstore.v16i32(<16 x i32> %0, i32* %1, <16 x i1> %2)
  ret void
}

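; The 64-bit reductions halve the vector (512 -> 256 -> 128) with extracts and
; the scalar op, then swap the two remaining lanes with vpshufd and combine
; once more. On X86 the i64 result is returned in edx:eax via vmovd/vpextrd;
; on X64 a single vmovq suffices.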
define i64 @test_mm512_reduce_add_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_add_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_add_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %add.i = add <4 x i64> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %shuffle3.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %add4.i = add <2 x i64> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x i64> %add4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %add7.i = add <2 x i64> %shuffle6.i, %add4.i
  %vecext.i = extractelement <2 x i64> %add7.i, i32 0
  ret i64 %vecext.i
}

define i64 @test_mm512_reduce_mul_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_mul_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vpsrlq $32, %ymm0, %ymm2
; X86-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
; X86-NEXT:    vpsrlq $32, %ymm1, %ymm3
; X86-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
; X86-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
; X86-NEXT:    vpsllq $32, %ymm2, %ymm2
; X86-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; X86-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpsrlq $32, %xmm0, %xmm2
; X86-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; X86-NEXT:    vpsrlq $32, %xmm1, %xmm3
; X86-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; X86-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; X86-NEXT:    vpsllq $32, %xmm2, %xmm2
; X86-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpsrlq $32, %xmm0, %xmm2
; X86-NEXT:    vpmuludq %xmm2, %xmm1, %xmm2
; X86-NEXT:    vpsrlq $32, %xmm1, %xmm3
; X86-NEXT:    vpmuludq %xmm0, %xmm3, %xmm3
; X86-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
; X86-NEXT:    vpsllq $32, %xmm2, %xmm2
; X86-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
; X86-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_mul_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vpsrlq $32, %ymm0, %ymm2
; X64-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
; X64-NEXT:    vpsrlq $32, %ymm1, %ymm3
; X64-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
; X64-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
; X64-NEXT:    vpsllq $32, %ymm2, %ymm2
; X64-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; X64-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpsrlq $32, %xmm0, %xmm2
; X64-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; X64-NEXT:    vpsrlq $32, %xmm1, %xmm3
; X64-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; X64-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; X64-NEXT:    vpsllq $32, %xmm2, %xmm2
; X64-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpsrlq $32, %xmm0, %xmm2
; X64-NEXT:    vpmuludq %xmm2, %xmm1, %xmm2
; X64-NEXT:    vpsrlq $32, %xmm1, %xmm3
; X64-NEXT:    vpmuludq %xmm0, %xmm3, %xmm3
; X64-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
; X64-NEXT:    vpsllq $32, %xmm2, %xmm2
; X64-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
; X64-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %mul.i = mul <4 x i64> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %shuffle3.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %mul4.i = mul <2 x i64> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x i64> %mul4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %mul7.i = mul <2 x i64> %shuffle6.i, %mul4.i
  %vecext.i = extractelement <2 x i64> %mul7.i, i32 0
  ret i64 %vecext.i
}

define i64 @test_mm512_reduce_or_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_or_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vpor %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpor %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_or_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vpor %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpor %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %or.i = or <4 x i64> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %shuffle3.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %or4.i = or <2 x i64> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x i64> %or4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %or7.i = or <2 x i64> %shuffle6.i, %or4.i
  %vecext.i = extractelement <2 x i64> %or7.i, i32 0
  ret i64 %vecext.i
}

define i64 @test_mm512_reduce_and_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_and_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vpand %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpand %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpand %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_and_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vpand %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpand %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpand %xmm0, %xmm1, %xmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %and.i = and <4 x i64> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %shuffle3.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %and4.i = and <2 x i64> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x i64> %and4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %and7.i = and <2 x i64> %shuffle6.i, %and4.i
  %vecext.i = extractelement <2 x i64> %and7.i, i32 0
  ret i64 %vecext.i
}

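; The masked 64-bit reductions first blend the identity element into the
; inactive lanes (0 for add/or, 1 for mul, all-ones for and, the latter
; materialized with vpternlogd $255), then run the same shuffle tree.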
define i64 @test_mm512_mask_reduce_add_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_add_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_add_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %add.i = add <4 x i64> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %shuffle3.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %add4.i = add <2 x i64> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x i64> %add4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %add7.i = add <2 x i64> %shuffle6.i, %add4.i
  %vecext.i = extractelement <2 x i64> %add7.i, i32 0
  ret i64 %vecext.i
}

define i64 @test_mm512_mask_reduce_mul_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_mul_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]
; X86-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vpsrlq $32, %ymm1, %ymm2
; X86-NEXT:    vpmuludq %ymm0, %ymm2, %ymm2
; X86-NEXT:    vpsrlq $32, %ymm0, %ymm3
; X86-NEXT:    vpmuludq %ymm3, %ymm1, %ymm3
; X86-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
; X86-NEXT:    vpsllq $32, %ymm2, %ymm2
; X86-NEXT:    vpmuludq %ymm0, %ymm1, %ymm0
; X86-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpsrlq $32, %xmm0, %xmm2
; X86-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; X86-NEXT:    vpsrlq $32, %xmm1, %xmm3
; X86-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; X86-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; X86-NEXT:    vpsllq $32, %xmm2, %xmm2
; X86-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpsrlq $32, %xmm0, %xmm2
; X86-NEXT:    vpmuludq %xmm2, %xmm1, %xmm2
; X86-NEXT:    vpsrlq $32, %xmm1, %xmm3
; X86-NEXT:    vpmuludq %xmm0, %xmm3, %xmm3
; X86-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
; X86-NEXT:    vpsllq $32, %xmm2, %xmm2
; X86-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
; X86-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_mul_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1]
; X64-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vpsrlq $32, %ymm1, %ymm2
; X64-NEXT:    vpmuludq %ymm0, %ymm2, %ymm2
; X64-NEXT:    vpsrlq $32, %ymm0, %ymm3
; X64-NEXT:    vpmuludq %ymm3, %ymm1, %ymm3
; X64-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
; X64-NEXT:    vpsllq $32, %ymm2, %ymm2
; X64-NEXT:    vpmuludq %ymm0, %ymm1, %ymm0
; X64-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpsrlq $32, %xmm0, %xmm2
; X64-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; X64-NEXT:    vpsrlq $32, %xmm1, %xmm3
; X64-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; X64-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; X64-NEXT:    vpsllq $32, %xmm2, %xmm2
; X64-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpsrlq $32, %xmm0, %xmm2
; X64-NEXT:    vpmuludq %xmm2, %xmm1, %xmm2
; X64-NEXT:    vpsrlq $32, %xmm1, %xmm3
; X64-NEXT:    vpmuludq %xmm0, %xmm3, %xmm3
; X64-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
; X64-NEXT:    vpsllq $32, %xmm2, %xmm2
; X64-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
; X64-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %mul.i = mul <4 x i64> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %shuffle3.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %mul4.i = mul <2 x i64> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x i64> %mul4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %mul7.i = mul <2 x i64> %shuffle6.i, %mul4.i
  %vecext.i = extractelement <2 x i64> %mul7.i, i32 0
  ret i64 %vecext.i
}

define i64 @test_mm512_mask_reduce_and_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_and_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vpand %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpand %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpand %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_and_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vpand %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpand %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpand %xmm0, %xmm1, %xmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %and.i = and <4 x i64> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %shuffle3.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %and4.i = and <2 x i64> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x i64> %and4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %and7.i = and <2 x i64> %shuffle6.i, %and4.i
  %vecext.i = extractelement <2 x i64> %and7.i, i32 0
  ret i64 %vecext.i
}

define i64 @test_mm512_mask_reduce_or_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_or_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vpor %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpor %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_or_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vpor %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpor %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %or.i = or <4 x i64> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %shuffle3.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %or4.i = or <2 x i64> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x i64> %or4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %or7.i = or <2 x i64> %shuffle6.i, %or4.i
  %vecext.i = extractelement <2 x i64> %or7.i, i32 0
  ret i64 %vecext.i
}

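; The 32-bit reductions need one extra halving step; the IR bitcasts between
; 64-bit and 32-bit element types around the shufflevectors where needed.
; reduce_add finishes with vphaddd rather than a second vpshufd.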
define i32 @test_mm512_reduce_add_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_add_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %0 = bitcast <4 x i64> %extract.i to <8 x i32>
  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
  %add.i = add <8 x i32> %0, %1
  %2 = bitcast <8 x i32> %add.i to <4 x i64>
  %extract3.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %3 = bitcast <2 x i64> %extract3.i to <4 x i32>
  %extract4.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %4 = bitcast <2 x i64> %extract4.i to <4 x i32>
  %add5.i = add <4 x i32> %3, %4
  %shuffle.i = shufflevector <4 x i32> %add5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %add6.i = add <4 x i32> %shuffle.i, %add5.i
  %shuffle7.i = shufflevector <4 x i32> %add6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %add8.i = add <4 x i32> %shuffle7.i, %add6.i
  %vecext.i = extractelement <4 x i32> %add8.i, i32 0
  ret i32 %vecext.i
}

define i32 @test_mm512_reduce_mul_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_mul_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %0 = bitcast <4 x i64> %extract.i to <8 x i32>
  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
  %mul.i = mul <8 x i32> %0, %1
  %2 = bitcast <8 x i32> %mul.i to <4 x i64>
  %extract3.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %3 = bitcast <2 x i64> %extract3.i to <4 x i32>
  %extract4.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %4 = bitcast <2 x i64> %extract4.i to <4 x i32>
  %mul5.i = mul <4 x i32> %3, %4
  %shuffle.i = shufflevector <4 x i32> %mul5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %mul6.i = mul <4 x i32> %shuffle.i, %mul5.i
  %shuffle7.i = shufflevector <4 x i32> %mul6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %mul8.i = mul <4 x i32> %shuffle7.i, %mul6.i
  %vecext.i = extractelement <4 x i32> %mul8.i, i32 0
  ret i32 %vecext.i
}

define i32 @test_mm512_reduce_or_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_or_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vpor %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpor %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpor %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT:    vpor %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %or25.i = or <4 x i64> %extract.i, %extract2.i
  %extract3.i = shufflevector <4 x i64> %or25.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract4.i = shufflevector <4 x i64> %or25.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %or526.i = or <2 x i64> %extract3.i, %extract4.i
  %or5.i = bitcast <2 x i64> %or526.i to <4 x i32>
  %shuffle.i = shufflevector <4 x i32> %or5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %or6.i = or <4 x i32> %shuffle.i, %or5.i
  %shuffle7.i = shufflevector <4 x i32> %or6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %or8.i = or <4 x i32> %shuffle7.i, %or6.i
  %vecext.i = extractelement <4 x i32> %or8.i, i32 0
  ret i32 %vecext.i
}

define i32 @test_mm512_reduce_and_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_and_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vpand %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpand %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpand %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT:    vpand %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %and25.i = and <4 x i64> %extract.i, %extract2.i
  %extract3.i = shufflevector <4 x i64> %and25.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract4.i = shufflevector <4 x i64> %and25.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %and526.i = and <2 x i64> %extract3.i, %extract4.i
  %and5.i = bitcast <2 x i64> %and526.i to <4 x i32>
  %shuffle.i = shufflevector <4 x i32> %and5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %and6.i = and <4 x i32> %shuffle.i, %and5.i
  %shuffle7.i = shufflevector <4 x i32> %and6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %and8.i = and <4 x i32> %shuffle7.i, %and6.i
  %vecext.i = extractelement <4 x i32> %and8.i, i32 0
  ret i32 %vecext.i
}

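; Masked 32-bit reductions follow the same pattern with vmovdqa32 blends, a
; vpbroadcastd splat of 1 as the mul identity, and vpternlogd $255 for and.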
6981define i32 @test_mm512_mask_reduce_add_epi32(i16 zeroext %__M, <8 x i64> %__W) {
6982; X86-LABEL: test_mm512_mask_reduce_add_epi32:
6983; X86:       # %bb.0: # %entry
6984; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
6985; X86-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
6986; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
6987; X86-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
6988; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
6989; X86-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
6990; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6991; X86-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
6992; X86-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
6993; X86-NEXT:    vmovd %xmm0, %eax
6994; X86-NEXT:    vzeroupper
6995; X86-NEXT:    retl
6996;
6997; X64-LABEL: test_mm512_mask_reduce_add_epi32:
6998; X64:       # %bb.0: # %entry
6999; X64-NEXT:    kmovw %edi, %k1
7000; X64-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
7001; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
7002; X64-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
7003; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
7004; X64-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
7005; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
7006; X64-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
7007; X64-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
7008; X64-NEXT:    vmovd %xmm0, %eax
7009; X64-NEXT:    vzeroupper
7010; X64-NEXT:    retq
7011entry:
7012  %0 = bitcast <8 x i64> %__W to <16 x i32>
7013  %1 = bitcast i16 %__M to <16 x i1>
7014  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
7015  %3 = bitcast <16 x i32> %2 to <8 x i64>
7016  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7017  %4 = bitcast <4 x i64> %extract.i to <8 x i32>
7018  %extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7019  %5 = bitcast <4 x i64> %extract3.i to <8 x i32>
7020  %add.i = add <8 x i32> %4, %5
7021  %6 = bitcast <8 x i32> %add.i to <4 x i64>
7022  %extract4.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
7023  %7 = bitcast <2 x i64> %extract4.i to <4 x i32>
7024  %extract5.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
7025  %8 = bitcast <2 x i64> %extract5.i to <4 x i32>
7026  %add6.i = add <4 x i32> %7, %8
7027  %shuffle.i = shufflevector <4 x i32> %add6.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
7028  %add7.i = add <4 x i32> %shuffle.i, %add6.i
7029  %shuffle8.i = shufflevector <4 x i32> %add7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
7030  %add9.i = add <4 x i32> %shuffle8.i, %add7.i
7031  %vecext.i = extractelement <4 x i32> %add9.i, i32 0
7032  ret i32 %vecext.i
7033}
7034
7035define i32 @test_mm512_mask_reduce_mul_epi32(i16 zeroext %__M, <8 x i64> %__W) {
7036; X86-LABEL: test_mm512_mask_reduce_mul_epi32:
7037; X86:       # %bb.0: # %entry
7038; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
7039; X86-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
7040; X86-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
7041; X86-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
7042; X86-NEXT:    vpmulld %ymm0, %ymm1, %ymm0
7043; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
7044; X86-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
7045; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
7046; X86-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
7047; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
7048; X86-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
7049; X86-NEXT:    vmovd %xmm0, %eax
7050; X86-NEXT:    vzeroupper
7051; X86-NEXT:    retl
7052;
7053; X64-LABEL: test_mm512_mask_reduce_mul_epi32:
7054; X64:       # %bb.0: # %entry
7055; X64-NEXT:    kmovw %edi, %k1
7056; X64-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
7057; X64-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
7058; X64-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
7059; X64-NEXT:    vpmulld %ymm0, %ymm1, %ymm0
7060; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
7061; X64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
7062; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
7063; X64-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
7064; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
7065; X64-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
7066; X64-NEXT:    vmovd %xmm0, %eax
7067; X64-NEXT:    vzeroupper
7068; X64-NEXT:    retq
7069entry:
7070  %0 = bitcast <8 x i64> %__W to <16 x i32>
7071  %1 = bitcast i16 %__M to <16 x i1>
7072  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
7073  %3 = bitcast <16 x i32> %2 to <8 x i64>
7074  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7075  %4 = bitcast <4 x i64> %extract.i to <8 x i32>
7076  %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7077  %5 = bitcast <4 x i64> %extract4.i to <8 x i32>
7078  %mul.i = mul <8 x i32> %4, %5
7079  %6 = bitcast <8 x i32> %mul.i to <4 x i64>
7080  %extract5.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
7081  %7 = bitcast <2 x i64> %extract5.i to <4 x i32>
7082  %extract6.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
7083  %8 = bitcast <2 x i64> %extract6.i to <4 x i32>
7084  %mul7.i = mul <4 x i32> %7, %8
7085  %shuffle.i = shufflevector <4 x i32> %mul7.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
7086  %mul8.i = mul <4 x i32> %shuffle.i, %mul7.i
7087  %shuffle9.i = shufflevector <4 x i32> %mul8.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
7088  %mul10.i = mul <4 x i32> %shuffle9.i, %mul8.i
7089  %vecext.i = extractelement <4 x i32> %mul10.i, i32 0
7090  ret i32 %vecext.i
7091}
7092
define i32 @test_mm512_mask_reduce_and_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_and_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vpand %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpand %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpand %xmm0, %xmm1, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-NEXT:    vpand %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_and_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vpand %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpand %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpand %xmm0, %xmm1, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-NEXT:    vpand %xmm0, %xmm1, %xmm0
; X64-NEXT:    vmovd %xmm0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__W to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %and28.i = and <4 x i64> %extract.i, %extract4.i
  %extract5.i = shufflevector <4 x i64> %and28.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract6.i = shufflevector <4 x i64> %and28.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %and729.i = and <2 x i64> %extract5.i, %extract6.i
  %and7.i = bitcast <2 x i64> %and729.i to <4 x i32>
  %shuffle.i = shufflevector <4 x i32> %and7.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %and8.i = and <4 x i32> %shuffle.i, %and7.i
  %shuffle9.i = shufflevector <4 x i32> %and8.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %and10.i = and <4 x i32> %shuffle9.i, %and8.i
  %vecext.i = extractelement <4 x i32> %and10.i, i32 0
  ret i32 %vecext.i
}

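; Masked OR reduction over <16 x i32>: zero is the OR identity, so a single
; zero-masking vmovdqa32 seeds the inactive lanes.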
define i32 @test_mm512_mask_reduce_or_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_or_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vpor %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpor %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_or_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vpor %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpor %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X64-NEXT:    vmovd %xmm0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__W to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %or27.i = or <4 x i64> %extract.i, %extract3.i
  %extract4.i = shufflevector <4 x i64> %or27.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract5.i = shufflevector <4 x i64> %or27.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %or628.i = or <2 x i64> %extract4.i, %extract5.i
  %or6.i = bitcast <2 x i64> %or628.i to <4 x i32>
  %shuffle.i = shufflevector <4 x i32> %or6.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %or7.i = or <4 x i32> %shuffle.i, %or6.i
  %shuffle8.i = shufflevector <4 x i32> %or7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %or9.i = or <4 x i32> %shuffle8.i, %or7.i
  %vecext.i = extractelement <4 x i32> %or9.i, i32 0
  ret i32 %vecext.i
}

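; Unmasked FP add reduction over <8 x double>: 512 -> 256 -> 128 -> scalar
; halving; the i386 target returns the result on the x87 stack.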
define double @test_mm512_reduce_add_pd(<8 x double> %__W) {
; X86-LABEL: test_mm512_reduce_add_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovlpd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_add_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %add.i = fadd <4 x double> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %add7.i = fadd <2 x double> %add4.i, %shuffle6.i
  %vecext.i = extractelement <2 x double> %add7.i, i32 0
  ret double %vecext.i
}

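; Unmasked FP multiply reduction over <8 x double>, same halving pattern.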
define double @test_mm512_reduce_mul_pd(<8 x double> %__W) {
; X86-LABEL: test_mm512_reduce_mul_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovlpd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_mul_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %mul.i = fmul <4 x double> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i
  %vecext.i = extractelement <2 x double> %mul7.i, i32 0
  ret double %vecext.i
}

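; Unmasked FP add reduction over <16 x float>; the halves are extracted
; through <8 x double> bitcasts, matching the clang-generated IR.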
define float @test_mm512_reduce_add_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_add_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %eax
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_add_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <16 x float> %__W to <8 x double>
  %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = bitcast <4 x double> %extract.i to <8 x float>
  %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %2 = bitcast <4 x double> %extract2.i to <8 x float>
  %add.i = fadd <8 x float> %1, %2
  %extract3.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %add5.i = fadd <4 x float> %extract3.i, %extract4.i
  %shuffle.i = shufflevector <4 x float> %add5.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %add6.i = fadd <4 x float> %add5.i, %shuffle.i
  %shuffle7.i = shufflevector <4 x float> %add6.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %add8.i = fadd <4 x float> %add6.i, %shuffle7.i
  %vecext.i = extractelement <4 x float> %add8.i, i32 0
  ret float %vecext.i
}

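; Unmasked FP multiply reduction over <16 x float>.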
define float @test_mm512_reduce_mul_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_mul_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %eax
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_mul_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <16 x float> %__W to <8 x double>
  %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = bitcast <4 x double> %extract.i to <8 x float>
  %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %2 = bitcast <4 x double> %extract2.i to <8 x float>
  %mul.i = fmul <8 x float> %1, %2
  %extract3.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %mul5.i = fmul <4 x float> %extract3.i, %extract4.i
  %shuffle.i = shufflevector <4 x float> %mul5.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %mul6.i = fmul <4 x float> %mul5.i, %shuffle.i
  %shuffle7.i = shufflevector <4 x float> %mul6.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %mul8.i = fmul <4 x float> %mul6.i, %shuffle7.i
  %vecext.i = extractelement <4 x float> %mul8.i, i32 0
  ret float %vecext.i
}

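; Masked FP add reduction: zero is the additive identity, so inactive lanes
; are cleared with a zero-masking vmovapd.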
define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W) {
; X86-LABEL: test_mm512_mask_reduce_add_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movb 8(%ebp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovapd %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovlpd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_add_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovapd %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> zeroinitializer
  %shuffle.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %add.i = fadd <4 x double> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %add7.i = fadd <2 x double> %add4.i, %shuffle6.i
  %vecext.i = extractelement <2 x double> %add7.i, i32 0
  ret double %vecext.i
}

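; Masked FP multiply reduction: inactive lanes are seeded with 1.0 via a
; broadcast before the halving tree.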
define double @test_mm512_mask_reduce_mul_pd(i8 zeroext %__M, <8 x double> %__W) {
; X86-LABEL: test_mm512_mask_reduce_mul_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movb 8(%ebp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1]
; X86-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vmulpd %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovlpd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_mul_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1]
; X64-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vmulpd %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
  %shuffle.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %mul.i = fmul <4 x double> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i
  %vecext.i = extractelement <2 x double> %mul7.i, i32 0
  ret double %vecext.i
}

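; Masked FP add reduction over <16 x float>, zero-seeded.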
define float @test_mm512_mask_reduce_add_ps(i16 zeroext %__M, <16 x float> %__W) {
; X86-LABEL: test_mm512_mask_reduce_add_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %eax
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovaps %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_add_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovaps %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i16 %__M to <16 x i1>
  %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> zeroinitializer
  %2 = bitcast <16 x float> %1 to <8 x double>
  %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = bitcast <4 x double> %extract.i to <8 x float>
  %extract3.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x double> %extract3.i to <8 x float>
  %add.i = fadd <8 x float> %3, %4
  %extract4.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract5.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %add6.i = fadd <4 x float> %extract4.i, %extract5.i
  %shuffle.i = shufflevector <4 x float> %add6.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %add7.i = fadd <4 x float> %add6.i, %shuffle.i
  %shuffle8.i = shufflevector <4 x float> %add7.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %add9.i = fadd <4 x float> %add7.i, %shuffle8.i
  %vecext.i = extractelement <4 x float> %add9.i, i32 0
  ret float %vecext.i
}

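; Masked FP multiply reduction over <16 x float>, seeded with 1.0.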
define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x float> %__W) {
; X86-LABEL: test_mm512_mask_reduce_mul_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %eax
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; X86-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vmulps %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_mul_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; X64-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vmulps %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i16 %__M to <16 x i1>
  %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
  %2 = bitcast <16 x float> %1 to <8 x double>
  %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = bitcast <4 x double> %extract.i to <8 x float>
  %extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x double> %extract4.i to <8 x float>
  %mul.i = fmul <8 x float> %3, %4
  %extract5.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract6.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %mul7.i = fmul <4 x float> %extract5.i, %extract6.i
  %shuffle.i = shufflevector <4 x float> %mul7.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %mul8.i = fmul <4 x float> %mul7.i, %shuffle.i
  %shuffle9.i = shufflevector <4 x float> %mul8.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %mul10.i = fmul <4 x float> %mul8.i, %shuffle9.i
  %vecext.i = extractelement <4 x float> %mul10.i, i32 0
  ret float %vecext.i
}

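; Signed i64 max reduction, done at full 512-bit width with shuffles and
; vpmaxsq; on i386 the i64 result is returned in edx:eax.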
define i64 @test_mm512_reduce_max_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_max_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
; X86-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X86-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X86-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_max_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
; X64-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X64-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X64-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %0 = icmp slt <8 x i64> %shuffle.i, %__W
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
  %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
  %2 = icmp sgt <8 x i64> %1, %shuffle1.i
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
  %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %4 = icmp sgt <8 x i64> %3, %shuffle3.i
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
  %vecext.i = extractelement <8 x i64> %5, i32 0
  ret i64 %vecext.i
}

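; Unsigned i64 max reduction (vpmaxuq).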
define i64 @test_mm512_reduce_max_epu64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_max_epu64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT:    vpmaxuq %zmm0, %zmm1, %zmm0
; X86-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X86-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X86-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_max_epu64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT:    vpmaxuq %zmm0, %zmm1, %zmm0
; X64-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X64-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X64-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %0 = icmp ult <8 x i64> %shuffle.i, %__W
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
  %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
  %2 = icmp ugt <8 x i64> %1, %shuffle1.i
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
  %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %4 = icmp ugt <8 x i64> %3, %shuffle3.i
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
  %vecext.i = extractelement <8 x i64> %5, i32 0
  ret i64 %vecext.i
}

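; FP max reduction: the IR calls the avx/sse2 max intrinsics directly, so the
; vmaxpd NaN and ordering semantics are preserved rather than fcmp+select.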
define double @test_mm512_reduce_max_pd(<8 x double> %__W) {
; X86-LABEL: test_mm512_reduce_max_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovlpd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_max_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %extract.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract2.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = tail call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %extract.i, <4 x double> %extract2.i)
  %extract4.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %extract5.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %1 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %extract4.i, <2 x double> %extract5.i)
  %shuffle.i = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %2 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %1, <2 x double> %shuffle.i)
  %vecext.i = extractelement <2 x double> %2, i32 0
  ret double %vecext.i
}

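; Signed i64 min reduction (vpminsq).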
define i64 @test_mm512_reduce_min_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_min_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT:    vpminsq %zmm0, %zmm1, %zmm0
; X86-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X86-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X86-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_min_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT:    vpminsq %zmm0, %zmm1, %zmm0
; X64-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X64-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X64-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %0 = icmp sgt <8 x i64> %shuffle.i, %__W
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
  %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
  %2 = icmp slt <8 x i64> %1, %shuffle1.i
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
  %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %4 = icmp slt <8 x i64> %3, %shuffle3.i
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
  %vecext.i = extractelement <8 x i64> %5, i32 0
  ret i64 %vecext.i
}

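; Unsigned i64 min reduction (vpminuq).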
define i64 @test_mm512_reduce_min_epu64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_min_epu64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT:    vpminuq %zmm0, %zmm1, %zmm0
; X86-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X86-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X86-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_min_epu64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT:    vpminuq %zmm0, %zmm1, %zmm0
; X64-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X64-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X64-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %0 = icmp ugt <8 x i64> %shuffle.i, %__W
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
  %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
  %2 = icmp ult <8 x i64> %1, %shuffle1.i
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
  %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %4 = icmp ult <8 x i64> %3, %shuffle3.i
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
  %vecext.i = extractelement <8 x i64> %5, i32 0
  ret i64 %vecext.i
}

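; FP min reduction via the avx/sse2 min intrinsics (vminpd).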
define double @test_mm512_reduce_min_pd(<8 x double> %__W) {
; X86-LABEL: test_mm512_reduce_min_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vminpd %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vminpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vminpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovlpd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_min_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vminpd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vminpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vminpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %extract.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract2.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = tail call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %extract.i, <4 x double> %extract2.i)
  %extract4.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %extract5.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %1 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %extract4.i, <2 x double> %extract5.i)
  %shuffle.i = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %2 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %1, <2 x double> %shuffle.i)
  %vecext.i = extractelement <2 x double> %2, i32 0
  ret double %vecext.i
}

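; Masked signed i64 max reduction: inactive lanes are seeded with INT64_MIN.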
define i64 @test_mm512_mask_reduce_max_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648]
; X86-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X86-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
; X86-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X86-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X86-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_max_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; X64-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X64-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
; X64-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X64-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X64-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>
  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %2 = icmp sgt <8 x i64> %1, %shuffle.i
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
  %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
  %4 = icmp sgt <8 x i64> %3, %shuffle3.i
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
  %shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %6 = icmp sgt <8 x i64> %5, %shuffle5.i
  %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
  %vecext.i = extractelement <8 x i64> %7, i32 0
  ret i64 %vecext.i
}

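; Masked unsigned i64 max reduction: zero is the identity, so a zero-masking
; move suffices.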
define i64 @test_mm512_mask_reduce_max_epu64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_epu64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X86-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X86-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_max_epu64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X64-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X64-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %2 = icmp ugt <8 x i64> %1, %shuffle.i
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
  %shuffle2.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
  %4 = icmp ugt <8 x i64> %3, %shuffle2.i
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle2.i
  %shuffle4.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %6 = icmp ugt <8 x i64> %5, %shuffle4.i
  %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle4.i
  %vecext.i = extractelement <8 x i64> %7, i32 0
  ret i64 %vecext.i
}

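; Masked FP max reduction: inactive lanes are seeded with -Inf.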
define double @test_mm512_mask_reduce_max_pd(i8 zeroext %__M, <8 x double> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movb 8(%ebp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; X86-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vmaxpd %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovlpd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_max_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; X64-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vmaxpd %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000>
  %extract.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %2 = tail call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %extract.i, <4 x double> %extract4.i) #3
  %extract6.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %extract7.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %3 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %extract6.i, <2 x double> %extract7.i) #3
  %shuffle.i = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %4 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %3, <2 x double> %shuffle.i) #3
  %vecext.i = extractelement <2 x double> %4, i32 0
  ret double %vecext.i
}

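; Masked signed i64 min reduction: inactive lanes are seeded with INT64_MAX.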
define i64 @test_mm512_mask_reduce_min_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647]
; X86-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X86-NEXT:    vpminsq %zmm0, %zmm1, %zmm0
; X86-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X86-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X86-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
; X64-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X64-NEXT:    vpminsq %zmm0, %zmm1, %zmm0
; X64-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X64-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X64-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807>
  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %2 = icmp slt <8 x i64> %1, %shuffle.i
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
  %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
  %4 = icmp slt <8 x i64> %3, %shuffle3.i
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
  %shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %6 = icmp slt <8 x i64> %5, %shuffle5.i
  %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
  %vecext.i = extractelement <8 x i64> %7, i32 0
  ret i64 %vecext.i
}

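; Masked unsigned i64 min reduction: inactive lanes are seeded with all-ones
; (UINT64_MAX), materialized with vpternlogd.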
define i64 @test_mm512_mask_reduce_min_epu64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epu64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X86-NEXT:    vpminuq %zmm0, %zmm1, %zmm0
; X86-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X86-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X86-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epu64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X64-NEXT:    vpminuq %zmm0, %zmm1, %zmm0
; X64-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X64-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X64-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %2 = icmp ult <8 x i64> %1, %shuffle.i
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
  %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
  %4 = icmp ult <8 x i64> %3, %shuffle3.i
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
  %shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %6 = icmp ult <8 x i64> %5, %shuffle5.i
  %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
  %vecext.i = extractelement <8 x i64> %7, i32 0
  ret i64 %vecext.i
}

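; Masked FP min reduction: inactive lanes are seeded with +Inf.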
define double @test_mm512_mask_reduce_min_pd(i8 zeroext %__M, <8 x double> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movb 8(%ebp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
; X86-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vminpd %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vminpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vminpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovlpd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_min_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
; X64-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vminpd %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vminpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vminpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000>
  %extract.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %2 = tail call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %extract.i, <4 x double> %extract4.i)
  %extract6.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %extract7.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %3 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %extract6.i, <2 x double> %extract7.i)
  %shuffle.i = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %4 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %3, <2 x double> %shuffle.i)
  %vecext.i = extractelement <2 x double> %4, i32 0
  ret double %vecext.i
}

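; Signed i32 max reduction; codegen is identical on both targets, so the
; assertions use the common CHECK prefix.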
define i32 @test_mm512_reduce_max_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_max_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = bitcast <4 x i64> %extract.i to <8 x i32>
  %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
  %2 = icmp sgt <8 x i32> %0, %1
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
  %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
  %7 = icmp sgt <4 x i32> %5, %6
  %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
  %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %9 = icmp sgt <4 x i32> %8, %shuffle.i
  %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
  %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %11 = icmp sgt <4 x i32> %10, %shuffle8.i
  %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
  %vecext.i = extractelement <4 x i32> %12, i32 0
  ret i32 %vecext.i
}

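; Unsigned i32 max reduction (vpmaxud).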
8207define i32 @test_mm512_reduce_max_epu32(<8 x i64> %__W) {
8208; CHECK-LABEL: test_mm512_reduce_max_epu32:
8209; CHECK:       # %bb.0: # %entry
8210; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
8211; CHECK-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
8212; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
8213; CHECK-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
8214; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8215; CHECK-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
8216; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
8217; CHECK-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
8218; CHECK-NEXT:    vmovd %xmm0, %eax
8219; CHECK-NEXT:    vzeroupper
8220; CHECK-NEXT:    ret{{[l|q]}}
8221entry:
8222  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8223  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8224  %0 = bitcast <4 x i64> %extract.i to <8 x i32>
8225  %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
8226  %2 = icmp ugt <8 x i32> %0, %1
8227  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
8228  %4 = bitcast <8 x i32> %3 to <4 x i64>
8229  %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
8230  %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
8231  %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
8232  %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
8233  %7 = icmp ugt <4 x i32> %5, %6
8234  %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
8235  %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
8236  %9 = icmp ugt <4 x i32> %8, %shuffle.i
8237  %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
8238  %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
8239  %11 = icmp ugt <4 x i32> %10, %shuffle8.i
8240  %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
8241  %vecext.i = extractelement <4 x i32> %12, i32 0
8242  ret i32 %vecext.i
8243}
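
; NOTE: The unsigned variant above differs from the signed one only in the
; icmp predicate (ugt instead of sgt), which selects vpmaxud instead of
; vpmaxsd.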

define float @test_mm512_reduce_max_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_max_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %eax
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_max_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <16 x float> %__W to <8 x double>
  %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = bitcast <4 x double> %extract.i to <8 x float>
  %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %2 = bitcast <4 x double> %extract2.i to <8 x float>
  %3 = tail call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %1, <8 x float> %2)
  %extract4.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract5.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %extract4.i, <4 x float> %extract5.i)
  %shuffle.i = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %5 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %4, <4 x float> %shuffle.i)
  %shuffle8.i = shufflevector <4 x float> %5, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %6 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %5, <4 x float> %shuffle8.i)
  %vecext.i = extractelement <4 x float> %6, i32 0
  ret float %vecext.i
}
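
; NOTE: The FP reductions go through the target intrinsics
; (llvm.x86.avx.max.ps.256, llvm.x86.sse.max.ps) rather than generic IR so the
; x86 vmaxps/vminps NaN and ordering semantics are preserved. On X86 the scalar
; result is spilled and reloaded with flds because the 32-bit calling
; convention returns float in st(0).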

define i32 @test_mm512_reduce_min_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_min_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = bitcast <4 x i64> %extract.i to <8 x i32>
  %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
  %2 = icmp slt <8 x i32> %0, %1
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
  %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
  %7 = icmp slt <4 x i32> %5, %6
  %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
  %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %9 = icmp slt <4 x i32> %8, %shuffle.i
  %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
  %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %11 = icmp slt <4 x i32> %10, %shuffle8.i
  %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
  %vecext.i = extractelement <4 x i32> %12, i32 0
  ret i32 %vecext.i
}

define i32 @test_mm512_reduce_min_epu32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_min_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vpminud %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpminud %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpminud %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT:    vpminud %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = bitcast <4 x i64> %extract.i to <8 x i32>
  %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
  %2 = icmp ult <8 x i32> %0, %1
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
  %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
  %7 = icmp ult <4 x i32> %5, %6
  %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
  %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %9 = icmp ult <4 x i32> %8, %shuffle.i
  %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
  %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %11 = icmp ult <4 x i32> %10, %shuffle8.i
  %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
  %vecext.i = extractelement <4 x i32> %12, i32 0
  ret i32 %vecext.i
}

define float @test_mm512_reduce_min_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_min_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %eax
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vminps %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vminps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vminps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT:    vminps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_min_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vminps %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vminps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vminps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT:    vminps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <16 x float> %__W to <8 x double>
  %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = bitcast <4 x double> %extract.i to <8 x float>
  %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %2 = bitcast <4 x double> %extract2.i to <8 x float>
  %3 = tail call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %1, <8 x float> %2)
  %extract4.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract5.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %extract4.i, <4 x float> %extract5.i)
  %shuffle.i = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %5 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %4, <4 x float> %shuffle.i)
  %shuffle8.i = shufflevector <4 x float> %5, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %6 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %5, <4 x float> %shuffle8.i)
  %vecext.i = extractelement <4 x float> %6, i32 0
  ret float %vecext.i
}
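
; NOTE: In the <4 x float> tail of these reductions the <2,3,0,1> lane swap is
; emitted as the double-width vpermilpd xmm0[1,0], while the final <1,0,3,2>
; swap uses vpermilps.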

define i32 @test_mm512_mask_reduce_max_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
; X86-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vpmaxsd %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_max_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
; X64-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vpmaxsd %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vmovd %xmm0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__W to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x i64> %extract.i to <8 x i32>
  %5 = bitcast <4 x i64> %extract4.i to <8 x i32>
  %6 = icmp sgt <8 x i32> %4, %5
  %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
  %8 = bitcast <8 x i32> %7 to <4 x i64>
  %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %9 = bitcast <2 x i64> %extract6.i to <4 x i32>
  %10 = bitcast <2 x i64> %extract7.i to <4 x i32>
  %11 = icmp sgt <4 x i32> %9, %10
  %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
  %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %13 = icmp sgt <4 x i32> %12, %shuffle.i
  %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
  %shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %15 = icmp sgt <4 x i32> %14, %shuffle10.i
  %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
  %vecext.i = extractelement <4 x i32> %16, i32 0
  ret i32 %vecext.i
}
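
; NOTE: The masked reductions first replace the inactive lanes with the
; identity element of the operation via select, here INT_MIN for signed max;
; that lowers to a vpbroadcastd of the identity plus a merge-masked
; vmovdqa32 {%k1}.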

define i32 @test_mm512_mask_reduce_max_epu32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_max_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; X64-NEXT:    vmovd %xmm0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__W to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x i64> %extract.i to <8 x i32>
  %5 = bitcast <4 x i64> %extract3.i to <8 x i32>
  %6 = icmp ugt <8 x i32> %4, %5
  %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
  %8 = bitcast <8 x i32> %7 to <4 x i64>
  %extract5.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %9 = bitcast <2 x i64> %extract5.i to <4 x i32>
  %10 = bitcast <2 x i64> %extract6.i to <4 x i32>
  %11 = icmp ugt <4 x i32> %9, %10
  %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
  %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %13 = icmp ugt <4 x i32> %12, %shuffle.i
  %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
  %shuffle9.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %15 = icmp ugt <4 x i32> %14, %shuffle9.i
  %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle9.i
  %vecext.i = extractelement <4 x i32> %16, i32 0
  ret i32 %vecext.i
}
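
; NOTE: For unsigned max the identity is 0, so no constant broadcast is
; needed: the select folds into a single zero-masked move,
; vmovdqa32 %zmm0, %zmm0 {%k1} {z}.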

define float @test_mm512_mask_reduce_max_ps(i16 zeroext %__M, <16 x float> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %eax
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vbroadcastss {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; X86-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vmaxps %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_max_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastss {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; X64-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vmaxps %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i16 %__M to <16 x i1>
  %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000>
  %2 = bitcast <16 x float> %1 to <8 x double>
  %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = bitcast <4 x double> %extract.i to <8 x float>
  %extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x double> %extract4.i to <8 x float>
  %5 = tail call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %3, <8 x float> %4)
  %extract6.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract7.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %6 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %extract6.i, <4 x float> %extract7.i)
  %shuffle.i = shufflevector <4 x float> %6, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %7 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %6, <4 x float> %shuffle.i)
  %shuffle10.i = shufflevector <4 x float> %7, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %8 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %7, <4 x float> %shuffle10.i)
  %vecext.i = extractelement <4 x float> %8, i32 0
  ret float %vecext.i
}

define i32 @test_mm512_mask_reduce_min_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
; X86-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vpminsd %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
; X64-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vpminsd %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vmovd %xmm0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__W to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x i64> %extract.i to <8 x i32>
  %5 = bitcast <4 x i64> %extract4.i to <8 x i32>
  %6 = icmp slt <8 x i32> %4, %5
  %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
  %8 = bitcast <8 x i32> %7 to <4 x i64>
  %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %9 = bitcast <2 x i64> %extract6.i to <4 x i32>
  %10 = bitcast <2 x i64> %extract7.i to <4 x i32>
  %11 = icmp slt <4 x i32> %9, %10
  %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
  %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %13 = icmp slt <4 x i32> %12, %shuffle.i
  %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
  %shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %15 = icmp slt <4 x i32> %14, %shuffle10.i
  %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
  %vecext.i = extractelement <4 x i32> %16, i32 0
  ret i32 %vecext.i
}

define i32 @test_mm512_mask_reduce_min_epu32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vpminud %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpminud %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpminud %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT:    vpminud %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vpminud %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpminud %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpminud %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT:    vpminud %xmm1, %xmm0, %xmm0
; X64-NEXT:    vmovd %xmm0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__W to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x i64> %extract.i to <8 x i32>
  %5 = bitcast <4 x i64> %extract4.i to <8 x i32>
  %6 = icmp ult <8 x i32> %4, %5
  %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
  %8 = bitcast <8 x i32> %7 to <4 x i64>
  %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %9 = bitcast <2 x i64> %extract6.i to <4 x i32>
  %10 = bitcast <2 x i64> %extract7.i to <4 x i32>
  %11 = icmp ult <4 x i32> %9, %10
  %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
  %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %13 = icmp ult <4 x i32> %12, %shuffle.i
  %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
  %shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %15 = icmp ult <4 x i32> %14, %shuffle10.i
  %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
  %vecext.i = extractelement <4 x i32> %16, i32 0
  ret i32 %vecext.i
}
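
; NOTE: For unsigned min the identity is UINT_MAX; the all-ones vector is
; materialized with the vpternlogd $255 idiom rather than a constant load.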

define float @test_mm512_mask_reduce_min_ps(i16 zeroext %__M, <16 x float> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %eax
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vbroadcastss {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
; X86-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vminps %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vminps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vminps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT:    vminps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_min_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastss {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
; X64-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vminps %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vminps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vminps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT:    vminps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i16 %__M to <16 x i1>
  %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000>
  %2 = bitcast <16 x float> %1 to <8 x double>
  %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = bitcast <4 x double> %extract.i to <8 x float>
  %extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x double> %extract4.i to <8 x float>
  %5 = tail call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %3, <8 x float> %4)
  %extract6.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract7.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %6 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %extract6.i, <4 x float> %extract7.i)
  %shuffle.i = shufflevector <4 x float> %6, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %7 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %6, <4 x float> %shuffle.i)
  %shuffle10.i = shufflevector <4 x float> %7, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %8 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %7, <4 x float> %shuffle10.i)
  %vecext.i = extractelement <4 x float> %8, i32 0
  ret float %vecext.i
}

define <8 x double> @test_mm512_mask_max_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_max_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_max_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
  ret <8 x double> %2
}
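
; NOTE: The mask/maskz arithmetic tests below all share one shape: call the
; rounding intrinsic with i32 4 (_MM_FROUND_CUR_DIRECTION) and select between
; the result and either the passthru operand or zeroinitializer, giving the
; {%k1} and {%k1} {z} forms. On X86 an i8 mask arrives on the stack and needs
; movb+kmovw, while on X64 it is already in %edi.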

define <8 x double> @test_mm512_maskz_max_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_max_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_max_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <16 x float> @test_mm512_mask_max_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_max_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_max_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
  ret <16 x float> %2
}

define <8 x double> @test_mm512_mask_max_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_max_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_max_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
  ret <8 x double> %2
}

declare <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double>, <8 x double>, i32)

define <8 x double> @test_mm512_maskz_max_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_max_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_max_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_max_round_pd(<8 x double> %__A, <8 x double> %__B) {
; CHECK-LABEL: test_mm512_max_round_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  ret <8 x double> %0
}
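
; NOTE: With rounding argument 4 (current direction) the round variants carry
; no embedded rounding, so they compile to exactly the same vmaxpd/vmaxps as
; the plain versions.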

define <16 x float> @test_mm512_maskz_max_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_max_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_max_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_max_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_max_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_max_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
  ret <16 x float> %2
}

declare <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float>, <16 x float>, i32)

define <16 x float> @test_mm512_maskz_max_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_max_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_max_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_max_round_ps(<16 x float> %__A, <16 x float> %__B) {
; CHECK-LABEL: test_mm512_max_round_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vmaxps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  ret <16 x float> %0
}

define <8 x double> @test_mm512_mask_min_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_min_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_min_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_min_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_min_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_min_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask_min_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_min_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_min_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
  ret <8 x double> %2
}

declare <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double>, <8 x double>, i32)

define <8 x double> @test_mm512_maskz_min_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_min_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_min_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_min_round_pd(<8 x double> %__A, <8 x double> %__B) {
; CHECK-LABEL: test_mm512_min_round_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vminpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  ret <8 x double> %0
}

define <16 x float> @test_mm512_mask_min_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_min_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vminps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_min_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vminps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_min_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_min_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_min_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_min_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_min_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vminps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_min_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vminps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
  ret <16 x float> %2
}

declare <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float>, <16 x float>, i32)

define <16 x float> @test_mm512_maskz_min_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_min_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_min_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_min_round_ps(<16 x float> %__A, <16 x float> %__B) {
; CHECK-LABEL: test_mm512_min_round_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vminps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  ret <16 x float> %0
}

define <8 x double> @test_mm512_sqrt_pd(<8 x double> %a) {
; CHECK-LABEL: test_mm512_sqrt_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsqrtpd %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a)
  ret <8 x double> %0
}
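
; NOTE: The unmasked and mask/maskz sqrt tests use the generic llvm.sqrt
; intrinsic; only the embedded-rounding variants below need the target
; llvm.x86.avx512.sqrt.pd.512 and llvm.x86.avx512.sqrt.ps.512 intrinsics.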

define <8 x double> @test_mm512_mask_sqrt_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A) {
; X86-LABEL: test_mm512_mask_sqrt_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsqrtpd %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_sqrt_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtpd %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %__A)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_sqrt_pd(i8 zeroext %__U, <8 x double> %__A) {
; X86-LABEL: test_mm512_maskz_sqrt_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsqrtpd %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_sqrt_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtpd %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %__A)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask_sqrt_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A) {
; X86-LABEL: test_mm512_mask_sqrt_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsqrtpd {rn-sae}, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_sqrt_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtpd {rn-sae}, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
  ret <8 x double> %2
}

declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>, i32)

define <8 x double> @test_mm512_maskz_sqrt_round_pd(i8 zeroext %__U, <8 x double> %__A) {
; X86-LABEL: test_mm512_maskz_sqrt_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsqrtpd {rn-sae}, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_sqrt_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtpd {rn-sae}, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_sqrt_round_pd(<8 x double> %__A) {
; CHECK-LABEL: test_mm512_sqrt_round_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsqrtpd {rn-sae}, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8)
  ret <8 x double> %0
}
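
; NOTE: The sqrt round tests pass i32 8 (round-to-nearest plus
; suppress-all-exceptions), which shows up as the {rn-sae} operand.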

define <16 x float> @test_mm512_sqrt_ps(<16 x float> %a) {
; CHECK-LABEL: test_mm512_sqrt_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsqrtps %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a)
  ret <16 x float> %0
}

define <16 x float> @test_mm512_mask_sqrt_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_mask_sqrt_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vsqrtps %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_sqrt_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtps %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %__A)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_sqrt_ps(i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_maskz_sqrt_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vsqrtps %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_sqrt_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtps %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %__A)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_sqrt_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_mask_sqrt_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vsqrtps {rn-sae}, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_sqrt_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtps {rn-sae}, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
  ret <16 x float> %2
}

declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>, i32)

define <16 x float> @test_mm512_maskz_sqrt_round_ps(i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_maskz_sqrt_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vsqrtps {rn-sae}, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_sqrt_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtps {rn-sae}, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_sqrt_round_ps(<16 x float> %__A) {
; CHECK-LABEL: test_mm512_sqrt_round_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsqrtps {rn-sae}, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8)
  ret <16 x float> %0
}

define <8 x i64> @test_mm512_rol_epi32(<8 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm512_rol_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprold $5, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = tail call <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32> %0, i32 5)
  %2 = bitcast <16 x i32> %1 to <8 x i64>
  ret <8 x i64> %2
}
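
; NOTE: The immediate rotates call the llvm.x86.avx512.prol.d/q.512 intrinsics
; directly; the epi32 forms bitcast between <8 x i64> and <16 x i32> around
; the call because the intrinsic is defined on dword elements.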
9354
9355declare <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32>, i32) #1
9356
9357define <8 x i64> @test_mm512_mask_rol_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A) {
9358; X86-LABEL: test_mm512_mask_rol_epi32:
9359; X86:       # %bb.0: # %entry
9360; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
9361; X86-NEXT:    vprold $5, %zmm1, %zmm0 {%k1}
9362; X86-NEXT:    retl
9363;
9364; X64-LABEL: test_mm512_mask_rol_epi32:
9365; X64:       # %bb.0: # %entry
9366; X64-NEXT:    kmovw %edi, %k1
9367; X64-NEXT:    vprold $5, %zmm1, %zmm0 {%k1}
9368; X64-NEXT:    retq
9369entry:
9370  %0 = bitcast <8 x i64> %__A to <16 x i32>
9371  %1 = tail call <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32> %0, i32 5)
9372  %2 = bitcast <8 x i64> %__W to <16 x i32>
9373  %3 = bitcast i16 %__U to <16 x i1>
9374  %4 = select <16 x i1> %3, <16 x i32> %1, <16 x i32> %2
9375  %5 = bitcast <16 x i32> %4 to <8 x i64>
9376  ret <8 x i64> %5
9377}
9378
9379define <8 x i64> @test_mm512_maskz_rol_epi32(i16 zeroext %__U, <8 x i64> %__A) {
9380; X86-LABEL: test_mm512_maskz_rol_epi32:
9381; X86:       # %bb.0: # %entry
9382; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
9383; X86-NEXT:    vprold $5, %zmm0, %zmm0 {%k1} {z}
9384; X86-NEXT:    retl
9385;
9386; X64-LABEL: test_mm512_maskz_rol_epi32:
9387; X64:       # %bb.0: # %entry
9388; X64-NEXT:    kmovw %edi, %k1
9389; X64-NEXT:    vprold $5, %zmm0, %zmm0 {%k1} {z}
9390; X64-NEXT:    retq
9391entry:
9392  %0 = bitcast <8 x i64> %__A to <16 x i32>
9393  %1 = tail call <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32> %0, i32 5)
9394  %2 = bitcast i16 %__U to <16 x i1>
9395  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
9396  %4 = bitcast <16 x i32> %3 to <8 x i64>
9397  ret <8 x i64> %4
9398}
9399
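; Immediate rotate-left on 64-bit elements (vprolq). The i8 mask covers the
; eight qword lanes, so the vector operands need no bitcasts.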
define <8 x i64> @test_mm512_rol_epi64(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_rol_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprolq $5, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64> %__A, i32 5)
  ret <8 x i64> %0
}

declare <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64>, i32) #1

define <8 x i64> @test_mm512_mask_rol_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_rol_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprolq $5, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_rol_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolq $5, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64> %__A, i32 5)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_rol_epi64(i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_rol_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprolq $5, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_rol_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolq $5, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64> %__A, i32 5)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
  ret <8 x i64> %2
}

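; Variable rotate-left on 32-bit elements (vprolvd): each lane is rotated by
; the count held in the corresponding lane of the second operand.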
define <8 x i64> @test_mm512_rolv_epi32(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_rolv_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = tail call <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32> %0, <16 x i32> %1)
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  ret <8 x i64> %3
}

define <8 x i64> @test_mm512_mask_rolv_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_rolv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vprolvd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_rolv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolvd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = tail call <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32> %0, <16 x i32> %1)
  %3 = bitcast <8 x i64> %__W to <16 x i32>
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> %3
  %6 = bitcast <16 x i32> %5 to <8 x i64>
  ret <8 x i64> %6
}

define <8 x i64> @test_mm512_maskz_rolv_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_rolv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vprolvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_rolv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = tail call <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32> %0, <16 x i32> %1)
  %3 = bitcast i16 %__U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
  %5 = bitcast <16 x i32> %4 to <8 x i64>
  ret <8 x i64> %5
}

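; Variable rotate-left on 64-bit elements (vprolvq).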
define <8 x i64> @test_mm512_rolv_epi64(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_rolv_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64> %__A, <8 x i64> %__B)
  ret <8 x i64> %0
}

define <8 x i64> @test_mm512_mask_rolv_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_rolv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprolvq %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_rolv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolvq %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64> %__A, <8 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_rolv_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_rolv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprolvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_rolv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64> %__A, <8 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
  ret <8 x i64> %2
}

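; Immediate rotate-right on 32-bit elements (vprord); the masked forms use
; the same select-based structure as the rotate-left tests above.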
define <8 x i64> @test_mm512_ror_epi32(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_ror_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprord $5, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = tail call <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32> %0, i32 5)
  %2 = bitcast <16 x i32> %1 to <8 x i64>
  ret <8 x i64> %2
}

declare <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32>, i32) #1

define <8 x i64> @test_mm512_mask_ror_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_ror_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vprord $5, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_ror_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprord $5, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = tail call <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32> %0, i32 5)
  %2 = bitcast <8 x i64> %__W to <16 x i32>
  %3 = bitcast i16 %__U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x i32> %1, <16 x i32> %2
  %5 = bitcast <16 x i32> %4 to <8 x i64>
  ret <8 x i64> %5
}

define <8 x i64> @test_mm512_maskz_ror_epi32(i16 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_ror_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vprord $5, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_ror_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprord $5, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = tail call <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32> %0, i32 5)
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
  %4 = bitcast <16 x i32> %3 to <8 x i64>
  ret <8 x i64> %4
}

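; Immediate rotate-right on 64-bit elements (vprorq).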
define <8 x i64> @test_mm512_ror_epi64(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_ror_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprorq $5, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64> %__A, i32 5)
  ret <8 x i64> %0
}

declare <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64>, i32) #1

define <8 x i64> @test_mm512_mask_ror_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_ror_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorq $5, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_ror_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorq $5, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64> %__A, i32 5)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_ror_epi64(i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_ror_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorq $5, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_ror_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorq $5, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64> %__A, i32 5)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
  ret <8 x i64> %2
}

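; Variable rotate-right on 32-bit elements (vprorvd).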
define <8 x i64> @test_mm512_rorv_epi32(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_rorv_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprorvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = tail call <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32> %0, <16 x i32> %1)
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  ret <8 x i64> %3
}

define <8 x i64> @test_mm512_mask_rorv_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_rorv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vprorvd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_rorv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = tail call <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32> %0, <16 x i32> %1)
  %3 = bitcast <8 x i64> %__W to <16 x i32>
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> %3
  %6 = bitcast <16 x i32> %5 to <8 x i64>
  ret <8 x i64> %6
}

define <8 x i64> @test_mm512_maskz_rorv_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_rorv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_rorv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = tail call <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32> %0, <16 x i32> %1)
  %3 = bitcast i16 %__U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
  %5 = bitcast <16 x i32> %4 to <8 x i64>
  ret <8 x i64> %5
}

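; Variable rotate-right on 64-bit elements (vprorvq).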
define <8 x i64> @test_mm512_rorv_epi64(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_rorv_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprorvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64> %__A, <8 x i64> %__B)
  ret <8 x i64> %0
}

define <8 x i64> @test_mm512_mask_rorv_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_rorv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvq %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_rorv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvq %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64> %__A, <8 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_rorv_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_rorv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_rorv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64> %__A, <8 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
  ret <8 x i64> %2
}

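; Declarations for the variable-rotate intrinsics used above and for
; intrinsics referenced by tests elsewhere in this file.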
declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>) #9
declare <16 x float> @llvm.fma.v16f32(<16 x float>, <16 x float>, <16 x float>) #9
declare float @llvm.fma.f32(float, float, float) #9
declare double @llvm.fma.f64(double, double, double) #9
declare <8 x i64> @llvm.masked.expandload.v8i64(i64*, <8 x i1>, <8 x i64>)
declare <8 x double> @llvm.masked.expandload.v8f64(double*, <8 x i1>, <8 x double>)
declare <16 x i32> @llvm.masked.expandload.v16i32(i32*, <16 x i1>, <16 x i32>) #10
declare <16 x float> @llvm.masked.expandload.v16f32(float*, <16 x i1>, <16 x float>)
declare void @llvm.masked.compressstore.v8f64(<8 x double>, double*, <8 x i1>)
declare void @llvm.masked.compressstore.v8i64(<8 x i64>, i64*, <8 x i1>)
declare void @llvm.masked.compressstore.v16f32(<16 x float>, float*, <16 x i1>)
declare void @llvm.masked.compressstore.v16i32(<16 x i32>, i32*, <16 x i1>)
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>)
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>)
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>)
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>)
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>)
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>)
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>)
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>)
declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
declare <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32>, <16 x i32>)
declare <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64>, <8 x i64>)
declare <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32>, <16 x i32>)
declare <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64>, <8 x i64>)

!0 = !{i32 1}