1; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s
2
3; 256-bit
4
; Byte equality compare, 256-bit, all lanes enabled (mask = -1).
define i32 @test_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b) {
; CHECK-LABEL: test_pcmpeq_b_256
; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 ##
  %cmp = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1)
  ret i32 %cmp
}
11
; Byte equality compare, 256-bit, under a caller-supplied write mask.
define i32 @test_mask_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_b_256
; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ##
  %cmp = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
  ret i32 %cmp
}
18
19declare i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8>, <32 x i8>, i32)
20
; Word equality compare, 256-bit, all lanes enabled (mask = -1).
define i16 @test_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b) {
; CHECK-LABEL: test_pcmpeq_w_256
; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 ##
  %cmp = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
  ret i16 %cmp
}
27
; Word equality compare, 256-bit, under a caller-supplied write mask.
define i16 @test_mask_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_w_256
; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ##
  %cmp = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
  ret i16 %cmp
}
34
35declare i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16>, <16 x i16>, i16)
36
; Signed byte greater-than compare, 256-bit, all lanes enabled.
define i32 @test_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b) {
; CHECK-LABEL: test_pcmpgt_b_256
; CHECK: vpcmpgtb %ymm1, %ymm0, %k0 ##
  %cmp = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1)
  ret i32 %cmp
}
43
; Signed byte greater-than compare, 256-bit, under a write mask.
define i32 @test_mask_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_b_256
; CHECK: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ##
  %cmp = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
  ret i32 %cmp
}
50
51declare i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8>, <32 x i8>, i32)
52
; Signed word greater-than compare, 256-bit, all lanes enabled.
define i16 @test_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b) {
; CHECK-LABEL: test_pcmpgt_w_256
; CHECK: vpcmpgtw %ymm1, %ymm0, %k0 ##
  %cmp = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
  ret i16 %cmp
}
59
; Signed word greater-than compare, 256-bit, under a write mask.
define i16 @test_mask_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_w_256
; CHECK: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} ##
  %cmp = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
  ret i16 %cmp
}
66
67declare i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16>, <16 x i16>, i16)
68
; Exercise all eight signed-byte compare predicates (imm8 0-7) of the 256-bit
; cmp.b intrinsic with a full mask; each i32 result mask is gathered into one
; <8 x i32> so no call is dead. (Label directive fixed: underscore variant was
; silently ignored by the checker, so these lines were never anchored.)
define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: test_cmp_b_256
; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 ##
  %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
; CHECK: vpcmpltb %ymm1, %ymm0, %k0 ##
  %res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 -1)
  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
; CHECK: vpcmpleb %ymm1, %ymm0, %k0 ##
  %res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 -1)
  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
; CHECK: vpcmpunordb %ymm1, %ymm0, %k0 ##
  %res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 -1)
  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
; CHECK: vpcmpneqb %ymm1, %ymm0, %k0 ##
  %res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 -1)
  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
; CHECK: vpcmpnltb %ymm1, %ymm0, %k0 ##
  %res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 -1)
  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
; CHECK: vpcmpnleb %ymm1, %ymm0, %k0 ##
  %res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 -1)
  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
; CHECK: vpcmpordb %ymm1, %ymm0, %k0 ##
  %res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 -1)
  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
  ret <8 x i32> %vec7
}
97
; Same eight signed-byte predicates as test_cmp_b_256, but run under a caller
; write mask so the compares must be emitted with {%k1}. (Label directive
; fixed: underscore variant was silently ignored by the checker.)
define <8 x i32> @test_mask_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
; CHECK-LABEL: test_mask_cmp_b_256
; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ##
  %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
; CHECK: vpcmpltb %ymm1, %ymm0, %k0 {%k1} ##
  %res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 %mask)
  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
; CHECK: vpcmpleb %ymm1, %ymm0, %k0 {%k1} ##
  %res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 %mask)
  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
; CHECK: vpcmpunordb %ymm1, %ymm0, %k0 {%k1} ##
  %res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 %mask)
  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
; CHECK: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ##
  %res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 %mask)
  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
; CHECK: vpcmpnltb %ymm1, %ymm0, %k0 {%k1} ##
  %res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 %mask)
  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
; CHECK: vpcmpnleb %ymm1, %ymm0, %k0 {%k1} ##
  %res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 %mask)
  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
; CHECK: vpcmpordb %ymm1, %ymm0, %k0 {%k1} ##
  %res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 %mask)
  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
  ret <8 x i32> %vec7
}
126
127declare i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone
128
; Unsigned-byte variant: all eight ucmp.b.256 predicates with a full mask.
; (Label directive fixed: underscore variant was silently ignored.)
define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: test_ucmp_b_256
; CHECK: vpcmpequb %ymm1, %ymm0, %k0 ##
  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
; CHECK: vpcmpltub %ymm1, %ymm0, %k0 ##
  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 -1)
  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
; CHECK: vpcmpleub %ymm1, %ymm0, %k0 ##
  %res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 -1)
  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
; CHECK: vpcmpunordub %ymm1, %ymm0, %k0 ##
  %res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 -1)
  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
; CHECK: vpcmpnequb %ymm1, %ymm0, %k0 ##
  %res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 -1)
  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
; CHECK: vpcmpnltub %ymm1, %ymm0, %k0 ##
  %res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 -1)
  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
; CHECK: vpcmpnleub %ymm1, %ymm0, %k0 ##
  %res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 -1)
  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
; CHECK: vpcmpordub %ymm1, %ymm0, %k0 ##
  %res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 -1)
  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
  ret <8 x i32> %vec7
}
157
; Unsigned-byte variant under a caller write mask ({%k1} expected on every
; compare). (Label directive fixed: underscore variant was silently ignored.)
define <8 x i32> @test_mask_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
; CHECK-LABEL: test_mask_ucmp_b_256
; CHECK: vpcmpequb %ymm1, %ymm0, %k0 {%k1} ##
  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
; CHECK: vpcmpltub %ymm1, %ymm0, %k0 {%k1} ##
  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 %mask)
  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
; CHECK: vpcmpleub %ymm1, %ymm0, %k0 {%k1} ##
  %res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 %mask)
  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
; CHECK: vpcmpunordub %ymm1, %ymm0, %k0 {%k1} ##
  %res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 %mask)
  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
; CHECK: vpcmpnequb %ymm1, %ymm0, %k0 {%k1} ##
  %res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 %mask)
  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
; CHECK: vpcmpnltub %ymm1, %ymm0, %k0 {%k1} ##
  %res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 %mask)
  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
; CHECK: vpcmpnleub %ymm1, %ymm0, %k0 {%k1} ##
  %res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 %mask)
  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
; CHECK: vpcmpordub %ymm1, %ymm0, %k0 {%k1} ##
  %res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 %mask)
  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
  ret <8 x i32> %vec7
}
186
187declare i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone
188
; All eight signed-word compare predicates (imm8 0-7), 256-bit, full mask;
; i16 results gathered into one <8 x i16>. (Label directive fixed:
; underscore variant was silently ignored by the checker.)
define <8 x i16> @test_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: test_cmp_w_256
; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 ##
  %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK: vpcmpltw %ymm1, %ymm0, %k0 ##
  %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
; CHECK: vpcmplew %ymm1, %ymm0, %k0 ##
  %res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 -1)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
; CHECK: vpcmpunordw %ymm1, %ymm0, %k0 ##
  %res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 -1)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
; CHECK: vpcmpneqw %ymm1, %ymm0, %k0 ##
  %res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 -1)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
; CHECK: vpcmpnltw %ymm1, %ymm0, %k0 ##
  %res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 -1)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
; CHECK: vpcmpnlew %ymm1, %ymm0, %k0 ##
  %res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 -1)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
; CHECK: vpcmpordw %ymm1, %ymm0, %k0 ##
  %res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 -1)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}
217
; Signed-word predicates under a caller write mask. (Label directive fixed:
; underscore variant was silently ignored by the checker.)
define <8 x i16> @test_mask_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) {
; CHECK-LABEL: test_mask_cmp_w_256
; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ##
  %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK: vpcmpltw %ymm1, %ymm0, %k0 {%k1} ##
  %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 %mask)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
; CHECK: vpcmplew %ymm1, %ymm0, %k0 {%k1} ##
  %res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 %mask)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
; CHECK: vpcmpunordw %ymm1, %ymm0, %k0 {%k1} ##
  %res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 %mask)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
; CHECK: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} ##
  %res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 %mask)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
; CHECK: vpcmpnltw %ymm1, %ymm0, %k0 {%k1} ##
  %res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 %mask)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
; CHECK: vpcmpnlew %ymm1, %ymm0, %k0 {%k1} ##
  %res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 %mask)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
; CHECK: vpcmpordw %ymm1, %ymm0, %k0 {%k1} ##
  %res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 %mask)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}
246
247declare i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone
248
; Unsigned-word variant: all eight ucmp.w.256 predicates with a full mask.
; (Label directive fixed: underscore variant was silently ignored.)
define <8 x i16> @test_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: test_ucmp_w_256
; CHECK: vpcmpequw %ymm1, %ymm0, %k0 ##
  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK: vpcmpltuw %ymm1, %ymm0, %k0 ##
  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
; CHECK: vpcmpleuw %ymm1, %ymm0, %k0 ##
  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 -1)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
; CHECK: vpcmpunorduw %ymm1, %ymm0, %k0 ##
  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 -1)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
; CHECK: vpcmpnequw %ymm1, %ymm0, %k0 ##
  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 -1)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
; CHECK: vpcmpnltuw %ymm1, %ymm0, %k0 ##
  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 -1)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
; CHECK: vpcmpnleuw %ymm1, %ymm0, %k0 ##
  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 -1)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
; CHECK: vpcmporduw %ymm1, %ymm0, %k0 ##
  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 -1)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}
277
; Unsigned-word predicates under a caller write mask. (Label directive fixed:
; underscore variant was silently ignored by the checker.)
define <8 x i16> @test_mask_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) {
; CHECK-LABEL: test_mask_ucmp_w_256
; CHECK: vpcmpequw %ymm1, %ymm0, %k0 {%k1} ##
  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} ##
  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 %mask)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
; CHECK: vpcmpleuw %ymm1, %ymm0, %k0 {%k1} ##
  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 %mask)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
; CHECK: vpcmpunorduw %ymm1, %ymm0, %k0 {%k1} ##
  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 %mask)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
; CHECK: vpcmpnequw %ymm1, %ymm0, %k0 {%k1} ##
  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 %mask)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
; CHECK: vpcmpnltuw %ymm1, %ymm0, %k0 {%k1} ##
  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 %mask)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
; CHECK: vpcmpnleuw %ymm1, %ymm0, %k0 {%k1} ##
  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 %mask)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
; CHECK: vpcmporduw %ymm1, %ymm0, %k0 {%k1} ##
  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 %mask)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}
306
307declare i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone
308
309; 128-bit
310
; Byte equality compare, 128-bit, all lanes enabled (mask = -1).
define i16 @test_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_pcmpeq_b_128
; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 ##
  %cmp = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1)
  ret i16 %cmp
}
317
; Byte equality compare, 128-bit, under a caller-supplied write mask.
define i16 @test_mask_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_b_128
; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ##
  %cmp = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask)
  ret i16 %cmp
}
324
325declare i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8>, <16 x i8>, i16)
326
; Word equality compare, 128-bit, all lanes enabled (mask = -1).
define i8 @test_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_pcmpeq_w_128
; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 ##
  %cmp = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1)
  ret i8 %cmp
}
333
; Word equality compare, 128-bit, under a caller-supplied write mask.
define i8 @test_mask_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_w_128
; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ##
  %cmp = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask)
  ret i8 %cmp
}
340
341declare i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16>, <8 x i16>, i8)
342
; Signed byte greater-than compare, 128-bit, all lanes enabled.
define i16 @test_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_pcmpgt_b_128
; CHECK: vpcmpgtb %xmm1, %xmm0, %k0 ##
  %cmp = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1)
  ret i16 %cmp
}
349
; Signed byte greater-than compare, 128-bit, under a write mask.
define i16 @test_mask_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_b_128
; CHECK: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} ##
  %cmp = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask)
  ret i16 %cmp
}
356
357declare i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8>, <16 x i8>, i16)
358
; Signed word greater-than compare, 128-bit, all lanes enabled.
define i8 @test_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_pcmpgt_w_128
; CHECK: vpcmpgtw %xmm1, %xmm0, %k0 ##
  %cmp = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1)
  ret i8 %cmp
}
365
; Signed word greater-than compare, 128-bit, under a write mask.
define i8 @test_mask_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_w_128
; CHECK: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} ##
  %cmp = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask)
  ret i8 %cmp
}
372
373declare i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16>, <8 x i16>, i8)
374
; All eight signed-byte compare predicates (imm8 0-7), 128-bit, full mask.
; (Label directive fixed: underscore variant was silently ignored.)
define <8 x i16> @test_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_cmp_b_128
; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 ##
  %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK: vpcmpltb %xmm1, %xmm0, %k0 ##
  %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
; CHECK: vpcmpleb %xmm1, %xmm0, %k0 ##
  %res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 -1)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
; CHECK: vpcmpunordb %xmm1, %xmm0, %k0 ##
  %res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 -1)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
; CHECK: vpcmpneqb %xmm1, %xmm0, %k0 ##
  %res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 -1)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
; CHECK: vpcmpnltb %xmm1, %xmm0, %k0 ##
  %res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 -1)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
; CHECK: vpcmpnleb %xmm1, %xmm0, %k0 ##
  %res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 -1)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
; CHECK: vpcmpordb %xmm1, %xmm0, %k0 ##
  %res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 -1)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}
403
; Signed-byte predicates, 128-bit, under a caller write mask. (Label
; directive fixed: underscore variant was silently ignored.)
define <8 x i16> @test_mask_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
; CHECK-LABEL: test_mask_cmp_b_128
; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ##
  %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK: vpcmpltb %xmm1, %xmm0, %k0 {%k1} ##
  %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 %mask)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
; CHECK: vpcmpleb %xmm1, %xmm0, %k0 {%k1} ##
  %res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 %mask)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
; CHECK: vpcmpunordb %xmm1, %xmm0, %k0 {%k1} ##
  %res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 %mask)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
; CHECK: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} ##
  %res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 %mask)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
; CHECK: vpcmpnltb %xmm1, %xmm0, %k0 {%k1} ##
  %res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 %mask)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
; CHECK: vpcmpnleb %xmm1, %xmm0, %k0 {%k1} ##
  %res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 %mask)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
; CHECK: vpcmpordb %xmm1, %xmm0, %k0 {%k1} ##
  %res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 %mask)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}
432
433declare i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone
434
; Unsigned-byte variant: all eight ucmp.b.128 predicates with a full mask.
; (Label directive fixed: underscore variant was silently ignored.)
define <8 x i16> @test_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_ucmp_b_128
; CHECK: vpcmpequb %xmm1, %xmm0, %k0 ##
  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK: vpcmpltub %xmm1, %xmm0, %k0 ##
  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
; CHECK: vpcmpleub %xmm1, %xmm0, %k0 ##
  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 -1)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
; CHECK: vpcmpunordub %xmm1, %xmm0, %k0 ##
  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 -1)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
; CHECK: vpcmpnequb %xmm1, %xmm0, %k0 ##
  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 -1)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
; CHECK: vpcmpnltub %xmm1, %xmm0, %k0 ##
  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 -1)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
; CHECK: vpcmpnleub %xmm1, %xmm0, %k0 ##
  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 -1)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
; CHECK: vpcmpordub %xmm1, %xmm0, %k0 ##
  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 -1)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}
463
; Unsigned-byte predicates, 128-bit, under a caller write mask. (Label
; directive fixed: underscore variant was silently ignored.)
define <8 x i16> @test_mask_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
; CHECK-LABEL: test_mask_ucmp_b_128
; CHECK: vpcmpequb %xmm1, %xmm0, %k0 {%k1} ##
  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK: vpcmpltub %xmm1, %xmm0, %k0 {%k1} ##
  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 %mask)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
; CHECK: vpcmpleub %xmm1, %xmm0, %k0 {%k1} ##
  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 %mask)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
; CHECK: vpcmpunordub %xmm1, %xmm0, %k0 {%k1} ##
  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 %mask)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
; CHECK: vpcmpnequb %xmm1, %xmm0, %k0 {%k1} ##
  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 %mask)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
; CHECK: vpcmpnltub %xmm1, %xmm0, %k0 {%k1} ##
  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 %mask)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
; CHECK: vpcmpnleub %xmm1, %xmm0, %k0 {%k1} ##
  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 %mask)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
; CHECK: vpcmpordub %xmm1, %xmm0, %k0 {%k1} ##
  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 %mask)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}
492
493declare i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone
494
; All eight signed-word compare predicates (imm8 0-7), 128-bit, full mask;
; i8 results gathered into one <8 x i8>. (Label directive fixed:
; underscore variant was silently ignored by the checker.)
define <8 x i8> @test_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_cmp_w_128
; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 ##
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltw %xmm1, %xmm0, %k0 ##
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmplew %xmm1, %xmm0, %k0 ##
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordw %xmm1, %xmm0, %k0 ##
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpneqw %xmm1, %xmm0, %k0 ##
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltw %xmm1, %xmm0, %k0 ##
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnlew %xmm1, %xmm0, %k0 ##
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordw %xmm1, %xmm0, %k0 ##
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}
523
; Signed-word predicates, 128-bit, under a caller write mask. (Label
; directive fixed: underscore variant was silently ignored.)
define <8 x i8> @test_mask_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_cmp_w_128
; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ##
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltw %xmm1, %xmm0, %k0 {%k1} ##
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmplew %xmm1, %xmm0, %k0 {%k1} ##
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordw %xmm1, %xmm0, %k0 {%k1} ##
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpneqw %xmm1, %xmm0, %k0 {%k1} ##
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltw %xmm1, %xmm0, %k0 {%k1} ##
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnlew %xmm1, %xmm0, %k0 {%k1} ##
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordw %xmm1, %xmm0, %k0 {%k1} ##
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}
552
553declare i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone
554
; Unsigned-word variant: all eight ucmp.w.128 predicates with a full mask.
; (Label directive fixed: underscore variant was silently ignored.)
define <8 x i8> @test_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_ucmp_w_128
; CHECK: vpcmpequw %xmm1, %xmm0, %k0 ##
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltuw %xmm1, %xmm0, %k0 ##
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleuw %xmm1, %xmm0, %k0 ##
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunorduw %xmm1, %xmm0, %k0 ##
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpnequw %xmm1, %xmm0, %k0 ##
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltuw %xmm1, %xmm0, %k0 ##
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleuw %xmm1, %xmm0, %k0 ##
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmporduw %xmm1, %xmm0, %k0 ##
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}
583
; Masked unsigned word-compare intrinsic test (128-bit): same predicate sweep
; as the unmasked variant, but each compare must carry the {%k1} write-mask.
; FIX: the label directive was misspelled "CHECK_LABEL" (underscore), so
; FileCheck silently ignored it; corrected to "CHECK-LABEL".
define <8 x i8> @test_mask_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_ucmp_w_128
; CHECK: vpcmpequw %xmm1, %xmm0, %k0 {%k1} ##
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} ##
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleuw %xmm1, %xmm0, %k0 {%k1} ##
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunorduw %xmm1, %xmm0, %k0 {%k1} ##
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpnequw %xmm1, %xmm0, %k0 {%k1} ##
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltuw %xmm1, %xmm0, %k0 {%k1} ##
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleuw %xmm1, %xmm0, %k0 {%k1} ##
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmporduw %xmm1, %xmm0, %k0 {%k1} ##
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}
612
613declare i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone
614
615declare <8 x float> @llvm.x86.fma.mask.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
616
; Masked FMA test: 256-bit float vfmadd213ps must use the EVEX encoding with a {%k1} write-mask.
define <8 x float> @test_mask_vfmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfmadd256_ps
  ; CHECK: vfmadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa8,0xc2]
  %res = call <8 x float> @llvm.x86.fma.mask.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
  ret <8 x float> %res
}
623
624declare <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
625
; Masked FMA test: 128-bit float vfmadd213ps must use the EVEX encoding with a {%k1} write-mask.
define <4 x float> @test_mask_vfmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfmadd128_ps
  ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xc2]
  %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
  ret <4 x float> %res
}
632
633declare <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
634
; Masked FMA test: 256-bit double vfmadd213pd must use the EVEX encoding with a {%k1} write-mask.
define <4 x double> @test_mask_fmadd256_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask) {
; CHECK-LABEL: test_mask_fmadd256_pd:
; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xc2]
  %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask)
  ret <4 x double> %res
}
641
642declare <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
643
; Masked FMA test: 128-bit double vfmadd213pd must use the EVEX encoding with a {%k1} write-mask.
define <2 x double> @test_mask_fmadd128_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
; CHECK-LABEL: test_mask_fmadd128_pd:
; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xc2]
  %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask)
  ret <2 x double> %res
}
650
651declare <8 x float> @llvm.x86.fma.mask.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
652
; Masked FMA test: 256-bit float vfmsub213ps must use the EVEX encoding with a {%k1} write-mask.
define <8 x float> @test_mask_vfmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfmsub256_ps
  ; CHECK: vfmsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xaa,0xc2]
  %res = call <8 x float> @llvm.x86.fma.mask.vfmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
  ret <8 x float> %res
}
659
660declare <4 x float> @llvm.x86.fma.mask.vfmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
661
; Masked FMA test: 128-bit float vfmsub213ps must use the EVEX encoding with a {%k1} write-mask.
define <4 x float> @test_mask_vfmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfmsub128_ps
  ; CHECK: vfmsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xaa,0xc2]
  %res = call <4 x float> @llvm.x86.fma.mask.vfmsub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
  ret <4 x float> %res
}
668
669declare <4 x double> @llvm.x86.fma.mask.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
670
; Masked FMA test: 256-bit double vfmsub213pd must use the EVEX encoding with a {%k1} write-mask.
define <4 x double> @test_mask_vfmsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfmsub256_pd
  ; CHECK: vfmsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xaa,0xc2]
  %res = call <4 x double> @llvm.x86.fma.mask.vfmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
  ret <4 x double> %res
}
677
678declare <2 x double> @llvm.x86.fma.mask.vfmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
679
; Masked FMA test: 128-bit double vfmsub213pd must use the EVEX encoding with a {%k1} write-mask.
define <2 x double> @test_mask_vfmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfmsub128_pd
  ; CHECK: vfmsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xaa,0xc2]
  %res = call <2 x double> @llvm.x86.fma.mask.vfmsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
  ret <2 x double> %res
}
686
687declare <8 x float> @llvm.x86.fma.mask.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
688
; Masked FMA test: 256-bit float vfnmadd213ps must use the EVEX encoding with a {%k1} write-mask.
define <8 x float> @test_mask_vfnmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfnmadd256_ps
  ; CHECK: vfnmadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xac,0xc2]
  %res = call <8 x float> @llvm.x86.fma.mask.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
  ret <8 x float> %res
}
695
696declare <4 x float> @llvm.x86.fma.mask.vfnmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
697
; Masked FMA test: 128-bit float vfnmadd213ps must use the EVEX encoding with a {%k1} write-mask.
define <4 x float> @test_mask_vfnmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfnmadd128_ps
  ; CHECK: vfnmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xac,0xc2]
  %res = call <4 x float> @llvm.x86.fma.mask.vfnmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
  ret <4 x float> %res
}
704
705declare <4 x double> @llvm.x86.fma.mask.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
706
; Masked FMA test: 256-bit double vfnmadd213pd must use the EVEX encoding with a {%k1} write-mask.
define <4 x double> @test_mask_vfnmadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfnmadd256_pd
  ; CHECK: vfnmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xac,0xc2]
  %res = call <4 x double> @llvm.x86.fma.mask.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
  ret <4 x double> %res
}
713
714declare <2 x double> @llvm.x86.fma.mask.vfnmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
715
; Masked FMA test: 128-bit double vfnmadd213pd must use the EVEX encoding with a {%k1} write-mask.
define <2 x double> @test_mask_vfnmadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfnmadd128_pd
  ; CHECK: vfnmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xac,0xc2]
  %res = call <2 x double> @llvm.x86.fma.mask.vfnmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
  ret <2 x double> %res
}
722
723declare <8 x float> @llvm.x86.fma.mask.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
724
; Masked FMA test: 256-bit float vfnmsub213ps must use the EVEX encoding with a {%k1} write-mask.
define <8 x float> @test_mask_vfnmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfnmsub256_ps
  ; CHECK: vfnmsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xae,0xc2]
  %res = call <8 x float> @llvm.x86.fma.mask.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
  ret <8 x float> %res
}
731
732declare <4 x float> @llvm.x86.fma.mask.vfnmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
733
; Masked FMA test: 128-bit float vfnmsub213ps must use the EVEX encoding with a {%k1} write-mask.
define <4 x float> @test_mask_vfnmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfnmsub128_ps
  ; CHECK: vfnmsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xae,0xc2]
  %res = call <4 x float> @llvm.x86.fma.mask.vfnmsub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
  ret <4 x float> %res
}
740
741declare <4 x double> @llvm.x86.fma.mask.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
742
; Masked FMA test: 256-bit double vfnmsub213pd must use the EVEX encoding with a {%k1} write-mask.
define <4 x double> @test_mask_vfnmsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfnmsub256_pd
  ; CHECK: vfnmsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xae,0xc2]
  %res = call <4 x double> @llvm.x86.fma.mask.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
  ret <4 x double> %res
}
749
750declare <2 x double> @llvm.x86.fma.mask.vfnmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
751
; Masked FMA test: 128-bit double vfnmsub213pd must use the EVEX encoding with a {%k1} write-mask.
define <2 x double> @test_mask_vfnmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfnmsub128_pd
  ; CHECK: vfnmsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xae,0xc2]
  %res = call <2 x double> @llvm.x86.fma.mask.vfnmsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
  ret <2 x double> %res
}
758
759declare <8 x float> @llvm.x86.fma.mask.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
760
; Masked FMA test: 256-bit float vfmaddsub213ps must use the EVEX encoding with a {%k1} write-mask.
define <8 x float> @test_mask_fmaddsub256_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask) {
; CHECK-LABEL: test_mask_fmaddsub256_ps:
; CHECK: vfmaddsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa6,0xc2]
  %res = call <8 x float> @llvm.x86.fma.mask.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask)
  ret <8 x float> %res
}
767
768declare <4 x float> @llvm.x86.fma.mask.vfmaddsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
769
; Masked FMA test: 128-bit float vfmaddsub213ps must use the EVEX encoding with a {%k1} write-mask.
define <4 x float> @test_mask_fmaddsub128_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
; CHECK-LABEL: test_mask_fmaddsub128_ps:
; CHECK: vfmaddsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa6,0xc2]
  %res = call <4 x float> @llvm.x86.fma.mask.vfmaddsub.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask)
  ret <4 x float> %res
}
776
777declare <4 x double> @llvm.x86.fma.mask.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
778
; Masked FMA test: 256-bit double vfmaddsub213pd must use the EVEX encoding with a {%k1} write-mask.
define <4 x double> @test_mask_vfmaddsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfmaddsub256_pd
  ; CHECK: vfmaddsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa6,0xc2]
  %res = call <4 x double> @llvm.x86.fma.mask.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
  ret <4 x double> %res
}
785
786declare <2 x double> @llvm.x86.fma.mask.vfmaddsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
787
; Masked FMA test: 128-bit double vfmaddsub213pd must use the EVEX encoding with a {%k1} write-mask.
define <2 x double> @test_mask_vfmaddsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfmaddsub128_pd
  ; CHECK: vfmaddsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa6,0xc2]
  %res = call <2 x double> @llvm.x86.fma.mask.vfmaddsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
  ret <2 x double> %res
}
794
795declare <8 x float> @llvm.x86.fma.mask.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
796
; Masked FMA test: 256-bit float vfmsubadd213ps must use the EVEX encoding with a {%k1} write-mask.
define <8 x float> @test_mask_vfmsubadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfmsubadd256_ps
  ; CHECK: vfmsubadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa7,0xc2]
  %res = call <8 x float> @llvm.x86.fma.mask.vfmsubadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
  ret <8 x float> %res
}
803
804declare <4 x float> @llvm.x86.fma.mask.vfmsubadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
805
; Masked FMA test: 128-bit float vfmsubadd213ps must use the EVEX encoding with a {%k1} write-mask.
define <4 x float> @test_mask_vfmsubadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfmsubadd128_ps
  ; CHECK: vfmsubadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa7,0xc2]
  %res = call <4 x float> @llvm.x86.fma.mask.vfmsubadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
  ret <4 x float> %res
}
812
813declare <4 x double> @llvm.x86.fma.mask.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
814
; Masked FMA test: 256-bit double vfmsubadd213pd must use the EVEX encoding with a {%k1} write-mask.
define <4 x double> @test_mask_vfmsubadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfmsubadd256_pd
  ; CHECK: vfmsubadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa7,0xc2]
  %res = call <4 x double> @llvm.x86.fma.mask.vfmsubadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
  ret <4 x double> %res
}
821declare <2 x double> @llvm.x86.fma.mask.vfmsubadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
822
; Masked FMA test: 128-bit double vfmsubadd213pd must use the EVEX encoding with a {%k1} write-mask.
define <2 x double> @test_mask_vfmsubadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfmsubadd128_pd
  ; CHECK: vfmsubadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa7,0xc2]
  %res = call <2 x double> @llvm.x86.fma.mask.vfmsubadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
  ret <2 x double> %res
}
829
; Masked FMA test: the memory operand should be folded, so vfmsubadd213pd takes (%rdi) directly.
define <2 x double> @test_mask_vfmsubadd128rm_pd(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfmsubadd128rm_pd
  ; CHECK: vfmsubadd213pd (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa7,0x07]
  %a2 = load <2 x double>* %ptr_a2
  %res = call <2 x double> @llvm.x86.fma.mask.vfmsubadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
  ret <2 x double> %res
}
837declare <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
; Masked FMA test (512-bit): folded memory operand with an under-aligned load (align 8) still selects vfmsubadd213pd (%rdi).
define <8 x double> @test_mask_vfmsubaddrm_pd(<8 x double> %a0, <8 x double> %a1, <8 x double>* %ptr_a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfmsubaddrm_pd
  ; CHECK: vfmsubadd213pd  (%rdi), %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0xa7,0x07]
  %a2 = load <8 x double>* %ptr_a2, align 8
  %res = call <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
  ret <8 x double> %res
}
845
; Register-only masked form: vfmadd213ps with all operands in xmm registers and a {%k1} mask.
define <4 x float> @test_mask_vfmadd128_ps_r(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfmadd128_ps_r
  ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xc2]
  %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
  ret <4 x float> %res
}
852
; All-ones mask (-1): must emit the unmasked EVEX form, without {%k1}.
define <4 x float> @test_mask_vfmadd128_ps_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
  ; CHECK-LABEL: test_mask_vfmadd128_ps_rz
  ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa8,0xc2]
  %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
  ret <4 x float> %res
}
859
; Masked form with folded memory operand: vfmadd213ps (%rdi) with {%k1}.
define <4 x float> @test_mask_vfmadd128_ps_rmk(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfmadd128_ps_rmk
  ; CHECK: vfmadd213ps	(%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
  %a2 = load <4 x float>* %ptr_a2
  %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
  ret <4 x float> %res
}
867
; Same as _rmk but the load is only 8-byte aligned: folding must still occur (EVEX has no alignment requirement here).
define <4 x float> @test_mask_vfmadd128_ps_rmka(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfmadd128_ps_rmka
  ; CHECK: vfmadd213ps     (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
  %a2 = load <4 x float>* %ptr_a2, align 8
  %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
  ret <4 x float> %res
}
875
; All-ones mask with a memory operand: should select the shorter VEX encoding (0xc4 prefix), not EVEX.
define <4 x float> @test_mask_vfmadd128_ps_rmkz(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2) {
  ; CHECK-LABEL: test_mask_vfmadd128_ps_rmkz
  ; CHECK: vfmadd213ps	(%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x07]
  %a2 = load <4 x float>* %ptr_a2
  %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
  ret <4 x float> %res
}
883
; Same as _rmkz with an explicitly 4-byte-aligned load: still folds and still uses the VEX encoding.
define <4 x float> @test_mask_vfmadd128_ps_rmkza(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2) {
  ; CHECK-LABEL: test_mask_vfmadd128_ps_rmkza
  ; CHECK: vfmadd213ps	(%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x07]
  %a2 = load <4 x float>* %ptr_a2, align 4
  %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
  ret <4 x float> %res
}
891
; Scalar splat built from four inserts of the same load: must fold to the {1to4} broadcast memory form with {%k1}.
define <4 x float> @test_mask_vfmadd128_ps_rmb(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfmadd128_ps_rmb
  ; CHECK: vfmadd213ps	(%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07]
  %q = load float* %ptr_a2
  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
  %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 %mask) nounwind
  ret <4 x float> %res
}
903
; Same broadcast-fold test as _rmb, with the scalar load explicitly 4-byte aligned.
define <4 x float> @test_mask_vfmadd128_ps_rmba(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfmadd128_ps_rmba
  ; CHECK: vfmadd213ps	(%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07]
  %q = load float* %ptr_a2, align 4
  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
  %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 %mask) nounwind
  ret <4 x float> %res
}
915
; Broadcast fold with all-ones mask: {1to4} memory form, EVEX without {%k1}.
define <4 x float> @test_mask_vfmadd128_ps_rmbz(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) {
  ; CHECK-LABEL: test_mask_vfmadd128_ps_rmbz
  ; CHECK: vfmadd213ps	(%rdi){1to4}, %xmm1, %xmm0  ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07]
  %q = load float* %ptr_a2
  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
  %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 -1) nounwind
  ret <4 x float> %res
}
927
; Same as _rmbz with the scalar load explicitly 4-byte aligned.
define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) {
  ; CHECK-LABEL: test_mask_vfmadd128_ps_rmbza
  ; CHECK: vfmadd213ps	(%rdi){1to4}, %xmm1, %xmm0  ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07]
  %q = load float* %ptr_a2, align 4
  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
  %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 -1) nounwind
  ret <4 x float> %res
}
939
; Register-only masked form: 128-bit vfmadd213pd with a {%k1} write-mask.
define <2 x double> @test_mask_vfmadd128_pd_r(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfmadd128_pd_r
  ; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xc2]
  %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
  ret <2 x double> %res
}
946
; All-ones mask: unmasked EVEX vfmadd213pd, no {%k1}.
define <2 x double> @test_mask_vfmadd128_pd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
  ; CHECK-LABEL: test_mask_vfmadd128_pd_rz
  ; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xa8,0xc2]
  %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind
  ret <2 x double> %res
}
953
; Masked form with folded memory operand: vfmadd213pd (%rdi) with {%k1}.
define <2 x double> @test_mask_vfmadd128_pd_rmk(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfmadd128_pd_rmk
  ; CHECK: vfmadd213pd	(%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0x07]
  %a2 = load <2 x double>* %ptr_a2
  %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
  ret <2 x double> %res
}
961
; All-ones mask with a memory operand: should select the shorter VEX encoding (0xc4 prefix), not EVEX.
define <2 x double> @test_mask_vfmadd128_pd_rmkz(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2) {
  ; CHECK-LABEL: test_mask_vfmadd128_pd_rmkz
  ; CHECK: vfmadd213pd	(%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xa8,0x07]
  %a2 = load <2 x double>* %ptr_a2
  %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind
  ret <2 x double> %res
}
969
; Register-only masked form: 256-bit vfmadd213pd with a {%k1} write-mask.
define <4 x double> @test_mask_vfmadd256_pd_r(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfmadd256_pd_r
  ; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xc2]
  %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
  ret <4 x double> %res
}
976
; All-ones mask: unmasked EVEX vfmadd213pd (256-bit), no {%k1}.
define <4 x double> @test_mask_vfmadd256_pd_rz(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
  ; CHECK-LABEL: test_mask_vfmadd256_pd_rz
  ; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xa8,0xc2]
  %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind
  ret <4 x double> %res
}
983
; Masked form with folded memory operand: 256-bit vfmadd213pd (%rdi) with {%k1}.
define <4 x double> @test_mask_vfmadd256_pd_rmk(<4 x double> %a0, <4 x double> %a1, <4 x double>* %ptr_a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfmadd256_pd_rmk
  ; CHECK: vfmadd213pd	(%rdi), %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0x07]
  %a2 = load <4 x double>* %ptr_a2
  %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
  ret <4 x double> %res
}
991
; All-ones mask with a memory operand: should select the shorter VEX encoding (0xc4 prefix), not EVEX.
define <4 x double> @test_mask_vfmadd256_pd_rmkz(<4 x double> %a0, <4 x double> %a1, <4 x double>* %ptr_a2) {
  ; CHECK-LABEL: test_mask_vfmadd256_pd_rmkz
  ; CHECK: vfmadd213pd	(%rdi), %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0xf5,0xa8,0x07]
  %a2 = load <4 x double>* %ptr_a2
  %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind
  ret <4 x double> %res
}
999