; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512f --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512f-builtins.c
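;
; For reference, the IR in this file corresponds to clang-level source of the
; following shape (a minimal sketch, assuming the usual immintrin.h intrinsic
; names; the exact wrappers in avx512f-builtins.c may differ):
;
;   __m512d fmadd_rn(__m512d A, __m512d B, __m512d C) {
;     return _mm512_fmadd_round_pd(A, B, C,
;                                  _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
;   }
;
; The i32 8 rounding argument used throughout is _MM_FROUND_TO_NEAREST_INT |
; _MM_FROUND_NO_EXC, which is why the checked assembly carries the {rn-sae}
; operand.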

define <8 x double> @test_mm512_fmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmadd_round_pd:
; CHECK:       ## %bb.0: ## %entry
; CHECK-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x18,0xa8,0xc2]
; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  ret <8 x double> %0
}

declare <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i32) #1
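;
; llvm.x86.avx512.vfmadd.pd.512 takes the three fma operands plus an i32
; rounding-mode immediate. The intrinsic has no masked form, so the masked
; tests below model the mask in IR by bitcasting the i8 mask to <8 x i1> and
; selecting between the fma result and the passthru operand (the first input
; for mask, the addend for mask3, zeroinitializer for maskz).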

define <8 x double> @test_mm512_mask_fmadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmadd_round_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x19,0x98,0xc1]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask_fmadd_round_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x19,0x98,0xc1]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmadd_round_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x19,0xb8,0xd1]
; X86-NEXT:    vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask3_fmadd_round_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x19,0xb8,0xd1]
; X64-NEXT:    vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmadd_round_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x99,0xa8,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_maskz_fmadd_round_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x99,0xa8,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmsub_round_pd:
; CHECK:       ## %bb.0: ## %entry
; CHECK-NEXT:    vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x18,0xaa,0xc2]
; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask_fmsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmsub_round_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x19,0x9a,0xc1]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask_fmsub_round_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x19,0x9a,0xc1]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmsub_round_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x99,0xaa,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_maskz_fmsub_round_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x99,0xaa,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fnmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fnmadd_round_pd:
; CHECK:       ## %bb.0: ## %entry
; CHECK-NEXT:    vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x18,0xac,0xc2]
; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask3_fnmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmadd_round_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x19,0xbc,0xd1]
; X86-NEXT:    vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask3_fnmadd_round_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x19,0xbc,0xd1]
; X64-NEXT:    vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fnmadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fnmadd_round_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x99,0xac,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_maskz_fnmadd_round_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x99,0xac,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fnmsub_round_pd:
; CHECK:       ## %bb.0: ## %entry
; CHECK-NEXT:    vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x18,0xae,0xc2]
; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %sub1, i32 8)
  ret <8 x double> %0
}

define <8 x double> @test_mm512_maskz_fnmsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fnmsub_round_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x99,0xae,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_maskz_fnmsub_round_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x99,0xae,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %sub1, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmadd_pd:
; CHECK:       ## %bb.0: ## %entry
; CHECK-NEXT:    vfmadd213pd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0xa8,0xc2]
; CHECK-NEXT:    ## zmm0 = (zmm1 * zmm0) + zmm2
; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask_fmadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmadd_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd132pd %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x49,0x98,0xc1]
; X86-NEXT:    ## zmm0 {%k1} = (zmm0 * zmm1) + zmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask_fmadd_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd132pd %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x49,0x98,0xc1]
; X64-NEXT:    ## zmm0 {%k1} = (zmm0 * zmm1) + zmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmadd_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd231pd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0xb8,0xd1]
; X86-NEXT:    ## zmm2 {%k1} = (zmm0 * zmm1) + zmm2
; X86-NEXT:    vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask3_fmadd_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd231pd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0xb8,0xd1]
; X64-NEXT:    ## zmm2 {%k1} = (zmm0 * zmm1) + zmm2
; X64-NEXT:    vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmadd_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd213pd %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0xa8,0xc2]
; X86-NEXT:    ## zmm0 {%k1} {z} = (zmm1 * zmm0) + zmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_maskz_fmadd_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd213pd %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0xa8,0xc2]
; X64-NEXT:    ## zmm0 {%k1} {z} = (zmm1 * zmm0) + zmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmsub_pd:
; CHECK:       ## %bb.0: ## %entry
; CHECK-NEXT:    vfmsub213pd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0xaa,0xc2]
; CHECK-NEXT:    ## zmm0 = (zmm1 * zmm0) - zmm2
; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask_fmsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmsub_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub132pd %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x49,0x9a,0xc1]
; X86-NEXT:    ## zmm0 {%k1} = (zmm0 * zmm1) - zmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask_fmsub_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub132pd %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x49,0x9a,0xc1]
; X64-NEXT:    ## zmm0 {%k1} = (zmm0 * zmm1) - zmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmsub_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub213pd %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0xaa,0xc2]
; X86-NEXT:    ## zmm0 {%k1} {z} = (zmm1 * zmm0) - zmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_maskz_fmsub_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub213pd %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0xaa,0xc2]
; X64-NEXT:    ## zmm0 {%k1} {z} = (zmm1 * zmm0) - zmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fnmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fnmadd_pd:
; CHECK:       ## %bb.0: ## %entry
; CHECK-NEXT:    vfnmadd213pd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0xac,0xc2]
; CHECK-NEXT:    ## zmm0 = -(zmm1 * zmm0) + zmm2
; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask3_fnmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmadd_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd231pd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0xbc,0xd1]
; X86-NEXT:    ## zmm2 {%k1} = -(zmm0 * zmm1) + zmm2
; X86-NEXT:    vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask3_fnmadd_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd231pd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0xbc,0xd1]
; X64-NEXT:    ## zmm2 {%k1} = -(zmm0 * zmm1) + zmm2
; X64-NEXT:    vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fnmadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fnmadd_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd213pd %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0xac,0xc2]
; X86-NEXT:    ## zmm0 {%k1} {z} = -(zmm1 * zmm0) + zmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_maskz_fnmadd_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd213pd %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0xac,0xc2]
; X64-NEXT:    ## zmm0 {%k1} {z} = -(zmm1 * zmm0) + zmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fnmsub_pd:
; CHECK:       ## %bb.0: ## %entry
; CHECK-NEXT:    vfnmsub213pd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0xae,0xc2]
; CHECK-NEXT:    ## zmm0 = -(zmm1 * zmm0) - zmm2
; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %sub1.i) #10
  ret <8 x double> %0
}

define <8 x double> @test_mm512_maskz_fnmsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fnmsub_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub213pd %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0xae,0xc2]
; X86-NEXT:    ## zmm0 {%k1} {z} = -(zmm1 * zmm0) - zmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_maskz_fnmsub_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub213pd %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0xae,0xc2]
; X64-NEXT:    ## zmm0 {%k1} {z} = -(zmm1 * zmm0) - zmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %sub1.i) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <16 x float> @test_mm512_fmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fmadd_round_ps:
; CHECK:       ## %bb.0: ## %entry
; CHECK-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0xc2]
; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
  ret <16 x float> %0
}

declare <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i32) #1
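;
; Same scheme as the pd variant above: <16 x float> operands plus the i32
; rounding immediate, with masking modeled by bitcasting the i16 mask to
; <16 x i1> and selecting.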
468
469define <16 x float> @test_mm512_mask_fmadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
470; X86-LABEL: test_mm512_mask_fmadd_round_ps:
471; X86:       ## %bb.0: ## %entry
472; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
473; X86-NEXT:    vfmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x19,0x98,0xc1]
474; X86-NEXT:    retl ## encoding: [0xc3]
475;
476; X64-LABEL: test_mm512_mask_fmadd_round_ps:
477; X64:       ## %bb.0: ## %entry
478; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
479; X64-NEXT:    vfmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x19,0x98,0xc1]
480; X64-NEXT:    retq ## encoding: [0xc3]
481entry:
482  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
483  %1 = bitcast i16 %__U to <16 x i1>
484  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
485  ret <16 x float> %2
486}
487
488define <16 x float> @test_mm512_mask3_fmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
489; X86-LABEL: test_mm512_mask3_fmadd_round_ps:
490; X86:       ## %bb.0: ## %entry
491; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
492; X86-NEXT:    vfmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x19,0xb8,0xd1]
493; X86-NEXT:    vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
494; X86-NEXT:    retl ## encoding: [0xc3]
495;
496; X64-LABEL: test_mm512_mask3_fmadd_round_ps:
497; X64:       ## %bb.0: ## %entry
498; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
499; X64-NEXT:    vfmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x19,0xb8,0xd1]
500; X64-NEXT:    vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
501; X64-NEXT:    retq ## encoding: [0xc3]
502entry:
503  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
504  %1 = bitcast i16 %__U to <16 x i1>
505  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
506  ret <16 x float> %2
507}
508
509define <16 x float> @test_mm512_maskz_fmadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
510; X86-LABEL: test_mm512_maskz_fmadd_round_ps:
511; X86:       ## %bb.0: ## %entry
512; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
513; X86-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x99,0xa8,0xc2]
514; X86-NEXT:    retl ## encoding: [0xc3]
515;
516; X64-LABEL: test_mm512_maskz_fmadd_round_ps:
517; X64:       ## %bb.0: ## %entry
518; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
519; X64-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x99,0xa8,0xc2]
520; X64-NEXT:    retq ## encoding: [0xc3]
521entry:
522  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
523  %1 = bitcast i16 %__U to <16 x i1>
524  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
525  ret <16 x float> %2
526}
527
528define <16 x float> @test_mm512_fmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
529; CHECK-LABEL: test_mm512_fmsub_round_ps:
530; CHECK:       ## %bb.0: ## %entry
531; CHECK-NEXT:    vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xaa,0xc2]
532; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
533entry:
534  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
535  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
536  ret <16 x float> %0
537}
538
539define <16 x float> @test_mm512_mask_fmsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
540; X86-LABEL: test_mm512_mask_fmsub_round_ps:
541; X86:       ## %bb.0: ## %entry
542; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
543; X86-NEXT:    vfmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x19,0x9a,0xc1]
544; X86-NEXT:    retl ## encoding: [0xc3]
545;
546; X64-LABEL: test_mm512_mask_fmsub_round_ps:
547; X64:       ## %bb.0: ## %entry
548; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
549; X64-NEXT:    vfmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x19,0x9a,0xc1]
550; X64-NEXT:    retq ## encoding: [0xc3]
551entry:
552  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
553  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
554  %1 = bitcast i16 %__U to <16 x i1>
555  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
556  ret <16 x float> %2
557}
558
559define <16 x float> @test_mm512_maskz_fmsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
560; X86-LABEL: test_mm512_maskz_fmsub_round_ps:
561; X86:       ## %bb.0: ## %entry
562; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
563; X86-NEXT:    vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x99,0xaa,0xc2]
564; X86-NEXT:    retl ## encoding: [0xc3]
565;
566; X64-LABEL: test_mm512_maskz_fmsub_round_ps:
567; X64:       ## %bb.0: ## %entry
568; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
569; X64-NEXT:    vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x99,0xaa,0xc2]
570; X64-NEXT:    retq ## encoding: [0xc3]
571entry:
572  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
573  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
574  %1 = bitcast i16 %__U to <16 x i1>
575  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
576  ret <16 x float> %2
577}
578
579define <16 x float> @test_mm512_fnmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
580; CHECK-LABEL: test_mm512_fnmadd_round_ps:
581; CHECK:       ## %bb.0: ## %entry
582; CHECK-NEXT:    vfnmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xac,0xc2]
583; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
584entry:
585  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
586  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
587  ret <16 x float> %0
588}
589
590define <16 x float> @test_mm512_mask3_fnmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
591; X86-LABEL: test_mm512_mask3_fnmadd_round_ps:
592; X86:       ## %bb.0: ## %entry
593; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
594; X86-NEXT:    vfnmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x19,0xbc,0xd1]
595; X86-NEXT:    vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
596; X86-NEXT:    retl ## encoding: [0xc3]
597;
598; X64-LABEL: test_mm512_mask3_fnmadd_round_ps:
599; X64:       ## %bb.0: ## %entry
600; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
601; X64-NEXT:    vfnmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x19,0xbc,0xd1]
602; X64-NEXT:    vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
603; X64-NEXT:    retq ## encoding: [0xc3]
604entry:
605  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
606  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
607  %1 = bitcast i16 %__U to <16 x i1>
608  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
609  ret <16 x float> %2
610}
611
612define <16 x float> @test_mm512_maskz_fnmadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
613; X86-LABEL: test_mm512_maskz_fnmadd_round_ps:
614; X86:       ## %bb.0: ## %entry
615; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
616; X86-NEXT:    vfnmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x99,0xac,0xc2]
617; X86-NEXT:    retl ## encoding: [0xc3]
618;
619; X64-LABEL: test_mm512_maskz_fnmadd_round_ps:
620; X64:       ## %bb.0: ## %entry
621; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
622; X64-NEXT:    vfnmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x99,0xac,0xc2]
623; X64-NEXT:    retq ## encoding: [0xc3]
624entry:
625  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
626  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
627  %1 = bitcast i16 %__U to <16 x i1>
628  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
629  ret <16 x float> %2
630}
631
632define <16 x float> @test_mm512_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
633; CHECK-LABEL: test_mm512_fnmsub_round_ps:
634; CHECK:       ## %bb.0: ## %entry
635; CHECK-NEXT:    vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xae,0xc2]
636; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
637entry:
638  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
639  %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
640  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %sub1, i32 8)
641  ret <16 x float> %0
642}
643
644define <16 x float> @test_mm512_maskz_fnmsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
645; X86-LABEL: test_mm512_maskz_fnmsub_round_ps:
646; X86:       ## %bb.0: ## %entry
647; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
648; X86-NEXT:    vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x99,0xae,0xc2]
649; X86-NEXT:    retl ## encoding: [0xc3]
650;
651; X64-LABEL: test_mm512_maskz_fnmsub_round_ps:
652; X64:       ## %bb.0: ## %entry
653; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
654; X64-NEXT:    vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x99,0xae,0xc2]
655; X64-NEXT:    retq ## encoding: [0xc3]
656entry:
657  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
658  %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
659  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %sub1, i32 8)
660  %1 = bitcast i16 %__U to <16 x i1>
661  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
662  ret <16 x float> %2
663}
664
665define <16 x float> @test_mm512_fmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
666; CHECK-LABEL: test_mm512_fmadd_ps:
667; CHECK:       ## %bb.0: ## %entry
668; CHECK-NEXT:    vfmadd213ps %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0xa8,0xc2]
669; CHECK-NEXT:    ## zmm0 = (zmm1 * zmm0) + zmm2
670; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
671entry:
672  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
673  ret <16 x float> %0
674}
675
676define <16 x float> @test_mm512_mask_fmadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
677; X86-LABEL: test_mm512_mask_fmadd_ps:
678; X86:       ## %bb.0: ## %entry
679; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
680; X86-NEXT:    vfmadd132ps %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x49,0x98,0xc1]
681; X86-NEXT:    ## zmm0 {%k1} = (zmm0 * zmm1) + zmm2
682; X86-NEXT:    retl ## encoding: [0xc3]
683;
684; X64-LABEL: test_mm512_mask_fmadd_ps:
685; X64:       ## %bb.0: ## %entry
686; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
687; X64-NEXT:    vfmadd132ps %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x49,0x98,0xc1]
688; X64-NEXT:    ## zmm0 {%k1} = (zmm0 * zmm1) + zmm2
689; X64-NEXT:    retq ## encoding: [0xc3]
690entry:
691  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
692  %1 = bitcast i16 %__U to <16 x i1>
693  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
694  ret <16 x float> %2
695}
696
697define <16 x float> @test_mm512_mask3_fmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
698; X86-LABEL: test_mm512_mask3_fmadd_ps:
699; X86:       ## %bb.0: ## %entry
700; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
701; X86-NEXT:    vfmadd231ps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0xb8,0xd1]
702; X86-NEXT:    ## zmm2 {%k1} = (zmm0 * zmm1) + zmm2
703; X86-NEXT:    vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
704; X86-NEXT:    retl ## encoding: [0xc3]
705;
706; X64-LABEL: test_mm512_mask3_fmadd_ps:
707; X64:       ## %bb.0: ## %entry
708; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
709; X64-NEXT:    vfmadd231ps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0xb8,0xd1]
710; X64-NEXT:    ## zmm2 {%k1} = (zmm0 * zmm1) + zmm2
711; X64-NEXT:    vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
712; X64-NEXT:    retq ## encoding: [0xc3]
713entry:
714  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
715  %1 = bitcast i16 %__U to <16 x i1>
716  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
717  ret <16 x float> %2
718}
719
720define <16 x float> @test_mm512_maskz_fmadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
721; X86-LABEL: test_mm512_maskz_fmadd_ps:
722; X86:       ## %bb.0: ## %entry
723; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
724; X86-NEXT:    vfmadd213ps %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0xa8,0xc2]
725; X86-NEXT:    ## zmm0 {%k1} {z} = (zmm1 * zmm0) + zmm2
726; X86-NEXT:    retl ## encoding: [0xc3]
727;
728; X64-LABEL: test_mm512_maskz_fmadd_ps:
729; X64:       ## %bb.0: ## %entry
730; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
731; X64-NEXT:    vfmadd213ps %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0xa8,0xc2]
732; X64-NEXT:    ## zmm0 {%k1} {z} = (zmm1 * zmm0) + zmm2
733; X64-NEXT:    retq ## encoding: [0xc3]
734entry:
735  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
736  %1 = bitcast i16 %__U to <16 x i1>
737  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
738  ret <16 x float> %2
739}
740
741define <16 x float> @test_mm512_fmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
742; CHECK-LABEL: test_mm512_fmsub_ps:
743; CHECK:       ## %bb.0: ## %entry
744; CHECK-NEXT:    vfmsub213ps %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0xaa,0xc2]
745; CHECK-NEXT:    ## zmm0 = (zmm1 * zmm0) - zmm2
746; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
747entry:
748  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
749  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
750  ret <16 x float> %0
751}
752
753define <16 x float> @test_mm512_mask_fmsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
754; X86-LABEL: test_mm512_mask_fmsub_ps:
755; X86:       ## %bb.0: ## %entry
756; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
757; X86-NEXT:    vfmsub132ps %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x49,0x9a,0xc1]
758; X86-NEXT:    ## zmm0 {%k1} = (zmm0 * zmm1) - zmm2
759; X86-NEXT:    retl ## encoding: [0xc3]
760;
761; X64-LABEL: test_mm512_mask_fmsub_ps:
762; X64:       ## %bb.0: ## %entry
763; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
764; X64-NEXT:    vfmsub132ps %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x49,0x9a,0xc1]
765; X64-NEXT:    ## zmm0 {%k1} = (zmm0 * zmm1) - zmm2
766; X64-NEXT:    retq ## encoding: [0xc3]
767entry:
768  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
769  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
770  %1 = bitcast i16 %__U to <16 x i1>
771  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
772  ret <16 x float> %2
773}
774
775define <16 x float> @test_mm512_maskz_fmsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
776; X86-LABEL: test_mm512_maskz_fmsub_ps:
777; X86:       ## %bb.0: ## %entry
778; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
779; X86-NEXT:    vfmsub213ps %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0xaa,0xc2]
780; X86-NEXT:    ## zmm0 {%k1} {z} = (zmm1 * zmm0) - zmm2
781; X86-NEXT:    retl ## encoding: [0xc3]
782;
783; X64-LABEL: test_mm512_maskz_fmsub_ps:
784; X64:       ## %bb.0: ## %entry
785; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
786; X64-NEXT:    vfmsub213ps %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0xaa,0xc2]
787; X64-NEXT:    ## zmm0 {%k1} {z} = (zmm1 * zmm0) - zmm2
788; X64-NEXT:    retq ## encoding: [0xc3]
789entry:
790  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
791  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
792  %1 = bitcast i16 %__U to <16 x i1>
793  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
794  ret <16 x float> %2
795}
796
797define <16 x float> @test_mm512_fnmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
798; CHECK-LABEL: test_mm512_fnmadd_ps:
799; CHECK:       ## %bb.0: ## %entry
800; CHECK-NEXT:    vfnmadd213ps %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0xac,0xc2]
801; CHECK-NEXT:    ## zmm0 = -(zmm1 * zmm0) + zmm2
802; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
803entry:
804  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
805  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
806  ret <16 x float> %0
807}
808
809define <16 x float> @test_mm512_mask3_fnmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
810; X86-LABEL: test_mm512_mask3_fnmadd_ps:
811; X86:       ## %bb.0: ## %entry
812; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
813; X86-NEXT:    vfnmadd231ps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0xbc,0xd1]
814; X86-NEXT:    ## zmm2 {%k1} = -(zmm0 * zmm1) + zmm2
815; X86-NEXT:    vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
816; X86-NEXT:    retl ## encoding: [0xc3]
817;
818; X64-LABEL: test_mm512_mask3_fnmadd_ps:
819; X64:       ## %bb.0: ## %entry
820; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
821; X64-NEXT:    vfnmadd231ps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0xbc,0xd1]
822; X64-NEXT:    ## zmm2 {%k1} = -(zmm0 * zmm1) + zmm2
823; X64-NEXT:    vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
824; X64-NEXT:    retq ## encoding: [0xc3]
825entry:
826  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
827  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
828  %1 = bitcast i16 %__U to <16 x i1>
829  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
830  ret <16 x float> %2
831}
832
833define <16 x float> @test_mm512_maskz_fnmadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
834; X86-LABEL: test_mm512_maskz_fnmadd_ps:
835; X86:       ## %bb.0: ## %entry
836; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
837; X86-NEXT:    vfnmadd213ps %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0xac,0xc2]
838; X86-NEXT:    ## zmm0 {%k1} {z} = -(zmm1 * zmm0) + zmm2
839; X86-NEXT:    retl ## encoding: [0xc3]
840;
841; X64-LABEL: test_mm512_maskz_fnmadd_ps:
842; X64:       ## %bb.0: ## %entry
843; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
844; X64-NEXT:    vfnmadd213ps %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0xac,0xc2]
845; X64-NEXT:    ## zmm0 {%k1} {z} = -(zmm1 * zmm0) + zmm2
846; X64-NEXT:    retq ## encoding: [0xc3]
847entry:
848  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
849  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
850  %1 = bitcast i16 %__U to <16 x i1>
851  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
852  ret <16 x float> %2
853}
854
define <16 x float> @test_mm512_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fnmsub_ps:
; CHECK:       ## %bb.0: ## %entry
; CHECK-NEXT:    vfnmsub213ps %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0xae,0xc2]
; CHECK-NEXT:    ## zmm0 = -(zmm1 * zmm0) - zmm2
; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %sub1.i) #10
  ret <16 x float> %0
}

define <16 x float> @test_mm512_maskz_fnmsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fnmsub_ps:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT:    vfnmsub213ps %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0xae,0xc2]
; X86-NEXT:    ## zmm0 {%k1} {z} = -(zmm1 * zmm0) - zmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_maskz_fnmsub_ps:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub213ps %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0xae,0xc2]
; X64-NEXT:    ## zmm0 {%k1} {z} = -(zmm1 * zmm0) - zmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %sub1.i) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

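; The rounded fmaddsub tests go through @llvm.x86.avx512.vfmaddsub.pd.512 with rounding mode 8 ({rn-sae}); the masked variants select against the passthrough (A, C, or zero).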
define <8 x double> @test_mm512_fmaddsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmaddsub_round_pd:
; CHECK:       ## %bb.0: ## %entry
; CHECK-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x18,0xa6,0xc2]
; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  ret <8 x double> %0
}

declare <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i32) #1

define <8 x double> @test_mm512_mask_fmaddsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmaddsub_round_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmaddsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x19,0x96,0xc1]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask_fmaddsub_round_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmaddsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x19,0x96,0xc1]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fmaddsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmaddsub_round_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmaddsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x19,0xb6,0xd1]
; X86-NEXT:    vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask3_fmaddsub_round_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmaddsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x19,0xb6,0xd1]
; X64-NEXT:    vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmaddsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmaddsub_round_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x99,0xa6,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_maskz_fmaddsub_round_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x99,0xa6,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

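; Rounded fmsubadd reuses the vfmaddsub intrinsic with the addend negated.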
define <8 x double> @test_mm512_fmsubadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmsubadd_round_pd:
; CHECK:       ## %bb.0: ## %entry
; CHECK-NEXT:    vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x18,0xa7,0xc2]
; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask_fmsubadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmsubadd_round_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsubadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x19,0x97,0xc1]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask_fmsubadd_round_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsubadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x19,0x97,0xc1]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmsubadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmsubadd_round_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x99,0xa7,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_maskz_fmsubadd_round_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x99,0xa7,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

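; Without rounding, fmaddsub is expressed as two @llvm.fma calls (one with C negated) blended by a shufflevector that takes even lanes from the subtract and odd lanes from the add.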
define <8 x double> @test_mm512_fmaddsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmaddsub_pd:
; CHECK:       ## %bb.0: ## %entry
; CHECK-NEXT:    vfmaddsub213pd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0xa6,0xc2]
; CHECK-NEXT:    ## zmm0 = (zmm1 * zmm0) +/- zmm2
; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
  %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x double> %3
}

define <8 x double> @test_mm512_mask_fmaddsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmaddsub_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmaddsub132pd %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x49,0x96,0xc1]
; X86-NEXT:    ## zmm0 {%k1} = (zmm0 * zmm1) +/- zmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask_fmaddsub_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmaddsub132pd %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x49,0x96,0xc1]
; X64-NEXT:    ## zmm0 {%k1} = (zmm0 * zmm1) +/- zmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
  %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %__A
  ret <8 x double> %5
}

define <8 x double> @test_mm512_mask3_fmaddsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmaddsub_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmaddsub231pd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0xb6,0xd1]
; X86-NEXT:    ## zmm2 {%k1} = (zmm0 * zmm1) +/- zmm2
; X86-NEXT:    vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask3_fmaddsub_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmaddsub231pd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0xb6,0xd1]
; X64-NEXT:    ## zmm2 {%k1} = (zmm0 * zmm1) +/- zmm2
; X64-NEXT:    vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
  %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %__C
  ret <8 x double> %5
}

define <8 x double> @test_mm512_maskz_fmaddsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmaddsub_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmaddsub213pd %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0xa6,0xc2]
; X86-NEXT:    ## zmm0 {%k1} {z} = (zmm1 * zmm0) +/- zmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_maskz_fmaddsub_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmaddsub213pd %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0xa6,0xc2]
; X64-NEXT:    ## zmm0 {%k1} {z} = (zmm1 * zmm0) +/- zmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
  %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x double> %3, <8 x double> zeroinitializer
  ret <8 x double> %5
}

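; fmsubadd swaps the shuffle sources: even lanes come from the add, odd lanes from the subtract.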
define <8 x double> @test_mm512_fmsubadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmsubadd_pd:
; CHECK:       ## %bb.0: ## %entry
; CHECK-NEXT:    vfmsubadd213pd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0xa7,0xc2]
; CHECK-NEXT:    ## zmm0 = (zmm1 * zmm0) -/+ zmm2
; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
  %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask_fmsubadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmsubadd_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsubadd132pd %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x49,0x97,0xc1]
; X86-NEXT:    ## zmm0 {%k1} = (zmm0 * zmm1) -/+ zmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask_fmsubadd_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsubadd132pd %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x49,0x97,0xc1]
; X64-NEXT:    ## zmm0 {%k1} = (zmm0 * zmm1) -/+ zmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
  %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__A
  ret <8 x double> %4
}

define <8 x double> @test_mm512_maskz_fmsubadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmsubadd_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsubadd213pd %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0xa7,0xc2]
; X86-NEXT:    ## zmm0 {%k1} {z} = (zmm1 * zmm0) -/+ zmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_maskz_fmsubadd_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsubadd213pd %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0xa7,0xc2]
; X64-NEXT:    ## zmm0 {%k1} {z} = (zmm1 * zmm0) -/+ zmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
  %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
  ret <8 x double> %4
}

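; The ps rounding variants mirror the pd tests above, with i16 masks and @llvm.x86.avx512.vfmaddsub.ps.512.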
define <16 x float> @test_mm512_fmaddsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fmaddsub_round_ps:
; CHECK:       ## %bb.0: ## %entry
; CHECK-NEXT:    vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa6,0xc2]
; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
  ret <16 x float> %0
}

declare <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i32) #1

define <16 x float> @test_mm512_mask_fmaddsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmaddsub_round_ps:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT:    vfmaddsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x19,0x96,0xc1]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask_fmaddsub_round_ps:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmaddsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x19,0x96,0xc1]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fmaddsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmaddsub_round_ps:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT:    vfmaddsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x19,0xb6,0xd1]
; X86-NEXT:    vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask3_fmaddsub_round_ps:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmaddsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x19,0xb6,0xd1]
; X64-NEXT:    vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmaddsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmaddsub_round_ps:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT:    vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x99,0xa6,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_maskz_fmaddsub_round_ps:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x99,0xa6,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_fmsubadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fmsubadd_round_ps:
; CHECK:       ## %bb.0: ## %entry
; CHECK-NEXT:    vfmsubadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa7,0xc2]
; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
  ret <16 x float> %0
}

define <16 x float> @test_mm512_mask_fmsubadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmsubadd_round_ps:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT:    vfmsubadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x19,0x97,0xc1]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask_fmsubadd_round_ps:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsubadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x19,0x97,0xc1]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmsubadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmsubadd_round_ps:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT:    vfmsubadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x99,0xa7,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_maskz_fmsubadd_round_ps:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsubadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x99,0xa7,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

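; Non-round ps fmaddsub/fmsubadd use the same two-fma-plus-shufflevector pattern as the pd tests.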
define <16 x float> @test_mm512_fmaddsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fmaddsub_ps:
; CHECK:       ## %bb.0: ## %entry
; CHECK-NEXT:    vfmaddsub213ps %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0xa6,0xc2]
; CHECK-NEXT:    ## zmm0 = (zmm1 * zmm0) +/- zmm2
; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
entry:
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
  %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  ret <16 x float> %3
}

define <16 x float> @test_mm512_mask_fmaddsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmaddsub_ps:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT:    vfmaddsub132ps %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x49,0x96,0xc1]
; X86-NEXT:    ## zmm0 {%k1} = (zmm0 * zmm1) +/- zmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask_fmaddsub_ps:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmaddsub132ps %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x49,0x96,0xc1]
; X64-NEXT:    ## zmm0 {%k1} = (zmm0 * zmm1) +/- zmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
  %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> %__A
  ret <16 x float> %5
}

define <16 x float> @test_mm512_mask3_fmaddsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmaddsub_ps:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT:    vfmaddsub231ps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0xb6,0xd1]
; X86-NEXT:    ## zmm2 {%k1} = (zmm0 * zmm1) +/- zmm2
; X86-NEXT:    vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask3_fmaddsub_ps:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmaddsub231ps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0xb6,0xd1]
; X64-NEXT:    ## zmm2 {%k1} = (zmm0 * zmm1) +/- zmm2
; X64-NEXT:    vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
  %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> %__C
  ret <16 x float> %5
}

define <16 x float> @test_mm512_maskz_fmaddsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmaddsub_ps:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT:    vfmaddsub213ps %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0xa6,0xc2]
; X86-NEXT:    ## zmm0 {%k1} {z} = (zmm1 * zmm0) +/- zmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_maskz_fmaddsub_ps:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmaddsub213ps %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0xa6,0xc2]
; X64-NEXT:    ## zmm0 {%k1} {z} = (zmm1 * zmm0) +/- zmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
  %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> zeroinitializer
  ret <16 x float> %5
}

define <16 x float> @test_mm512_fmsubadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fmsubadd_ps:
; CHECK:       ## %bb.0: ## %entry
; CHECK-NEXT:    vfmsubadd213ps %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0xa7,0xc2]
; CHECK-NEXT:    ## zmm0 = (zmm1 * zmm0) -/+ zmm2
; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
  %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_fmsubadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmsubadd_ps:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT:    vfmsubadd132ps %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x49,0x97,0xc1]
; X86-NEXT:    ## zmm0 {%k1} = (zmm0 * zmm1) -/+ zmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask_fmsubadd_ps:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsubadd132ps %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x49,0x97,0xc1]
; X64-NEXT:    ## zmm0 {%k1} = (zmm0 * zmm1) -/+ zmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
  %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  %3 = bitcast i16 %__U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__A
  ret <16 x float> %4
}

define <16 x float> @test_mm512_maskz_fmsubadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmsubadd_ps:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT:    vfmsubadd213ps %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0xa7,0xc2]
; X86-NEXT:    ## zmm0 {%k1} {z} = (zmm1 * zmm0) -/+ zmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_maskz_fmsubadd_ps:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsubadd213ps %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0xa7,0xc2]
; X64-NEXT:    ## zmm0 {%k1} {z} = (zmm1 * zmm0) -/+ zmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
  %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  %3 = bitcast i16 %__U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
  ret <16 x float> %4
}

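; The mask3 fmsub tests select C as the passthrough, so codegen picks the 231 form that accumulates into zmm2.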
define <8 x double> @test_mm512_mask3_fmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsub_round_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x19,0xba,0xd1]
; X86-NEXT:    vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask3_fmsub_round_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x19,0xba,0xd1]
; X64-NEXT:    vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsub_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub231pd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0xba,0xd1]
; X86-NEXT:    ## zmm2 {%k1} = (zmm0 * zmm1) - zmm2
; X86-NEXT:    vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask3_fmsub_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub231pd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0xba,0xd1]
; X64-NEXT:    ## zmm2 {%k1} = (zmm0 * zmm1) - zmm2
; X64-NEXT:    vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <16 x float> @test_mm512_mask3_fmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsub_round_ps:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT:    vfmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x19,0xba,0xd1]
; X86-NEXT:    vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask3_fmsub_round_ps:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x19,0xba,0xd1]
; X64-NEXT:    vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsub_ps:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT:    vfmsub231ps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0xba,0xd1]
; X86-NEXT:    ## zmm2 {%k1} = (zmm0 * zmm1) - zmm2
; X86-NEXT:    vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask3_fmsub_ps:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub231ps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0xba,0xd1]
; X64-NEXT:    ## zmm2 {%k1} = (zmm0 * zmm1) - zmm2
; X64-NEXT:    vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

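; mask3 fmsubadd likewise keeps C live and selects the 231 form.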
define <8 x double> @test_mm512_mask3_fmsubadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsubadd_round_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsubadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x19,0xb7,0xd1]
; X86-NEXT:    vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask3_fmsubadd_round_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsubadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x19,0xb7,0xd1]
; X64-NEXT:    vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fmsubadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsubadd_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsubadd231pd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0xb7,0xd1]
; X86-NEXT:    ## zmm2 {%k1} = (zmm0 * zmm1) -/+ zmm2
; X86-NEXT:    vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask3_fmsubadd_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsubadd231pd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0xb7,0xd1]
; X64-NEXT:    ## zmm2 {%k1} = (zmm0 * zmm1) -/+ zmm2
; X64-NEXT:    vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
  %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__C
  ret <8 x double> %4
}

define <16 x float> @test_mm512_mask3_fmsubadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsubadd_round_ps:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT:    vfmsubadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x19,0xb7,0xd1]
; X86-NEXT:    vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask3_fmsubadd_round_ps:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsubadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x19,0xb7,0xd1]
; X64-NEXT:    vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fmsubadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsubadd_ps:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT:    vfmsubadd231ps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0xb7,0xd1]
; X86-NEXT:    ## zmm2 {%k1} = (zmm0 * zmm1) -/+ zmm2
; X86-NEXT:    vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask3_fmsubadd_ps:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsubadd231ps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0xb7,0xd1]
; X64-NEXT:    ## zmm2 {%k1} = (zmm0 * zmm1) -/+ zmm2
; X64-NEXT:    vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
  %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  %3 = bitcast i16 %__U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__C
  ret <16 x float> %4
}

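; mask fnmadd negates A; with A as the passthrough, the 132 form is selected.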
1637define <8 x double> @test_mm512_mask_fnmadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
1638; X86-LABEL: test_mm512_mask_fnmadd_round_pd:
1639; X86:       ## %bb.0: ## %entry
1640; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
1641; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
1642; X86-NEXT:    vfnmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x19,0x9c,0xc1]
1643; X86-NEXT:    retl ## encoding: [0xc3]
1644;
1645; X64-LABEL: test_mm512_mask_fnmadd_round_pd:
1646; X64:       ## %bb.0: ## %entry
1647; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
1648; X64-NEXT:    vfnmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x19,0x9c,0xc1]
1649; X64-NEXT:    retq ## encoding: [0xc3]
1650entry:
1651  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
1652  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
1653  %1 = bitcast i8 %__U to <8 x i1>
1654  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
1655  ret <8 x double> %2
1656}
1657
define <8 x double> @test_mm512_mask_fnmadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fnmadd_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd132pd %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x49,0x9c,0xc1]
; X86-NEXT:    ## zmm0 {%k1} = -(zmm0 * zmm1) + zmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask_fnmadd_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd132pd %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x49,0x9c,0xc1]
; X64-NEXT:    ## zmm0 {%k1} = -(zmm0 * zmm1) + zmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <16 x float> @test_mm512_mask_fnmadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fnmadd_round_ps:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT:    vfnmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x19,0x9c,0xc1]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask_fnmadd_round_ps:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x19,0x9c,0xc1]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_fnmadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fnmadd_ps:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT:    vfnmadd132ps %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x49,0x9c,0xc1]
; X86-NEXT:    ## zmm0 {%k1} = -(zmm0 * zmm1) + zmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask_fnmadd_ps:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd132ps %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x49,0x9c,0xc1]
; X64-NEXT:    ## zmm0 {%k1} = -(zmm0 * zmm1) + zmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

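; FNMSUB tests: -(a*b) - c is expressed by negating both the second
; multiplicand and the addend before the fma call; the mask/mask3 variants
; should select the vfnmsub132/vfnmsub231 forms depending on which operand
; carries the passthru value.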
define <8 x double> @test_mm512_mask_fnmsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fnmsub_round_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x19,0x9e,0xc1]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask_fnmsub_round_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x19,0x9e,0xc1]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
  %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %sub, <8 x double> %sub1, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmsub_round_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x19,0xbe,0xd1]
; X86-NEXT:    vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask3_fnmsub_round_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x19,0xbe,0xd1]
; X64-NEXT:    vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
  %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %sub, <8 x double> %sub1, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask_fnmsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fnmsub_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub132pd %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x49,0x9e,0xc1]
; X86-NEXT:    ## zmm0 {%k1} = -(zmm0 * zmm1) - zmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask_fnmsub_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub132pd %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x49,0x9e,0xc1]
; X64-NEXT:    ## zmm0 {%k1} = -(zmm0 * zmm1) - zmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
  %sub2.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %sub.i, <8 x double> %sub2.i) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmsub_pd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub231pd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0xbe,0xd1]
; X86-NEXT:    ## zmm2 {%k1} = -(zmm0 * zmm1) - zmm2
; X86-NEXT:    vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask3_fnmsub_pd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub231pd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0xbe,0xd1]
; X64-NEXT:    ## zmm2 {%k1} = -(zmm0 * zmm1) - zmm2
; X64-NEXT:    vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
  %sub2.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %sub.i, <8 x double> %sub2.i) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <16 x float> @test_mm512_mask_fnmsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fnmsub_round_ps:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT:    vfnmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x19,0x9e,0xc1]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask_fnmsub_round_ps:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x19,0x9e,0xc1]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %sub, <16 x float> %sub1, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmsub_round_ps:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT:    vfnmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x19,0xbe,0xd1]
; X86-NEXT:    vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask3_fnmsub_round_ps:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x19,0xbe,0xd1]
; X64-NEXT:    vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %sub, <16 x float> %sub1, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_fnmsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fnmsub_ps:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT:    vfnmsub132ps %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x49,0x9e,0xc1]
; X86-NEXT:    ## zmm0 {%k1} = -(zmm0 * zmm1) - zmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask_fnmsub_ps:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub132ps %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x49,0x9e,0xc1]
; X64-NEXT:    ## zmm0 {%k1} = -(zmm0 * zmm1) - zmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %sub.i, <16 x float> %sub1.i) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmsub_ps:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT:    vfnmsub231ps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0xbe,0xd1]
; X86-NEXT:    ## zmm2 {%k1} = -(zmm0 * zmm1) - zmm2
; X86-NEXT:    vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm512_mask3_fnmsub_ps:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub231ps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0xbe,0xd1]
; X64-NEXT:    ## zmm2 {%k1} = -(zmm0 * zmm1) - zmm2
; X64-NEXT:    vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %sub.i, <16 x float> %sub1.i) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

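; Scalar (ss) masked FMA tests: element 0 of each operand is extracted,
; combined with @llvm.fma.f32, and merged back with a select on bit 0 of the
; mask, which should fold to a single masked vfmadd213ss/vfmadd231ss. The
; *_round variants with rounding mode 4 (current direction) go through the
; @llvm.x86.avx512.*.vfmadd.ss intrinsics and lower to the same instructions.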
define <4 x float> @test_mm_mask_fmadd_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fmadd_ss:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa9,0xc2]
; X86-NEXT:    ## xmm0 {%k1} = (xmm1 * xmm0) + xmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fmadd_ss:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa9,0xc2]
; X64-NEXT:    ## xmm0 {%k1} = (xmm1 * xmm0) + xmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %1 = extractelement <4 x float> %__A, i64 0
  %2 = extractelement <4 x float> %__B, i64 0
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <4 x float> %__W, i32 0
  %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
  %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask_fmadd_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fmadd_round_ss:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa9,0xc2]
; X86-NEXT:    ## xmm0 {%k1} = (xmm1 * xmm0) + xmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fmadd_round_ss:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa9,0xc2]
; X64-NEXT:    ## xmm0 {%k1} = (xmm1 * xmm0) + xmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %__W, <4 x float> %__A, <4 x float> %__B, i8 %__U, i32 4)
  ret <4 x float> %0
}

declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) #1

define <4 x float> @test_mm_maskz_fmadd_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmadd_ss:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x89,0xa9,0xc2]
; X86-NEXT:    ## xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fmadd_ss:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x89,0xa9,0xc2]
; X64-NEXT:    ## xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = extractelement <4 x float> %__A, i64 0
  %1 = extractelement <4 x float> %__B, i64 0
  %2 = extractelement <4 x float> %__C, i64 0
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_fmadd_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmadd_round_ss:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x89,0xa9,0xc2]
; X86-NEXT:    ## xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fmadd_round_ss:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x89,0xa9,0xc2]
; X64-NEXT:    ## xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = tail call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 %__U, i32 4)
  ret <4 x float> %0
}

declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) #1

define <4 x float> @test_mm_mask3_fmadd_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmadd_ss:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd231ss %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb9,0xd1]
; X86-NEXT:    ## xmm2 {%k1} = (xmm0 * xmm1) + xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0 ## encoding: [0xc5,0xf8,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fmadd_ss:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd231ss %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb9,0xd1]
; X64-NEXT:    ## xmm2 {%k1} = (xmm0 * xmm1) + xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0 ## encoding: [0xc5,0xf8,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %1 = extractelement <4 x float> %__X, i64 0
  %2 = extractelement <4 x float> %__Y, i64 0
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <4 x float> %__Y, i32 0
  %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
  %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask3_fmadd_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmadd_round_ss:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd231ss %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb9,0xd1]
; X86-NEXT:    ## xmm2 {%k1} = (xmm0 * xmm1) + xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0 ## encoding: [0xc5,0xf8,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fmadd_round_ss:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd231ss %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb9,0xd1]
; X64-NEXT:    ## xmm2 {%k1} = (xmm0 * xmm1) + xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0 ## encoding: [0xc5,0xf8,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 %__U, i32 4)
  ret <4 x float> %0
}

declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) #1

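; FMSUB scalar tests negate only the addend (fsub -0.0 of element 0 of the
; third operand) before the fma call.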
define <4 x float> @test_mm_mask_fmsub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fmsub_ss:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xab,0xc2]
; X86-NEXT:    ## xmm0 {%k1} = (xmm1 * xmm0) - xmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fmsub_ss:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xab,0xc2]
; X64-NEXT:    ## xmm0 {%k1} = (xmm1 * xmm0) - xmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %1 = extractelement <4 x float> %__A, i64 0
  %.rhs.i = extractelement <4 x float> %__B, i64 0
  %2 = fsub float -0.000000e+00, %.rhs.i
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <4 x float> %__W, i32 0
  %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
  %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask_fmsub_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fmsub_round_ss:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xab,0xc2]
; X86-NEXT:    ## xmm0 {%k1} = (xmm1 * xmm0) - xmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fmsub_round_ss:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xab,0xc2]
; X64-NEXT:    ## xmm0 {%k1} = (xmm1 * xmm0) - xmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %__W, <4 x float> %__A, <4 x float> %sub, i8 %__U, i32 4)
  ret <4 x float> %0
}

define <4 x float> @test_mm_maskz_fmsub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmsub_ss:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x89,0xab,0xc2]
; X86-NEXT:    ## xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fmsub_ss:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x89,0xab,0xc2]
; X64-NEXT:    ## xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = extractelement <4 x float> %__A, i64 0
  %1 = extractelement <4 x float> %__B, i64 0
  %.rhs.i = extractelement <4 x float> %__C, i64 0
  %2 = fsub float -0.000000e+00, %.rhs.i
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_fmsub_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmsub_round_ss:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x89,0xab,0xc2]
; X86-NEXT:    ## xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fmsub_round_ss:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x89,0xab,0xc2]
; X64-NEXT:    ## xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub, i8 %__U, i32 4)
  ret <4 x float> %0
}

define <4 x float> @test_mm_mask3_fmsub_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsub_ss:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub231ss %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbb,0xd1]
; X86-NEXT:    ## xmm2 {%k1} = (xmm0 * xmm1) - xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0 ## encoding: [0xc5,0xf8,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fmsub_ss:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub231ss %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbb,0xd1]
; X64-NEXT:    ## xmm2 {%k1} = (xmm0 * xmm1) - xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0 ## encoding: [0xc5,0xf8,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %1 = extractelement <4 x float> %__X, i64 0
  %.rhs.i = extractelement <4 x float> %__Y, i64 0
  %2 = fsub float -0.000000e+00, %.rhs.i
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <4 x float> %__Y, i32 0
  %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
  %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask3_fmsub_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsub_round_ss:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub231ss %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbb,0xd1]
; X86-NEXT:    ## xmm2 {%k1} = (xmm0 * xmm1) - xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0 ## encoding: [0xc5,0xf8,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fmsub_round_ss:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub231ss %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbb,0xd1]
; X64-NEXT:    ## xmm2 {%k1} = (xmm0 * xmm1) - xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0 ## encoding: [0xc5,0xf8,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 %__U, i32 4)
  ret <4 x float> %0
}

declare <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) #1

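; FNMADD scalar tests negate the second multiplicand instead of the addend;
; FNMSUB negates both.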
define <4 x float> @test_mm_mask_fnmadd_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fnmadd_ss:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xad,0xc2]
; X86-NEXT:    ## xmm0 {%k1} = -(xmm1 * xmm0) + xmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fnmadd_ss:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xad,0xc2]
; X64-NEXT:    ## xmm0 {%k1} = -(xmm1 * xmm0) + xmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %.rhs.i = extractelement <4 x float> %__A, i64 0
  %1 = fsub float -0.000000e+00, %.rhs.i
  %2 = extractelement <4 x float> %__B, i64 0
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <4 x float> %__W, i32 0
  %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
  %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask_fnmadd_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fnmadd_round_ss:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xad,0xc2]
; X86-NEXT:    ## xmm0 {%k1} = -(xmm1 * xmm0) + xmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fnmadd_round_ss:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xad,0xc2]
; X64-NEXT:    ## xmm0 {%k1} = -(xmm1 * xmm0) + xmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %__W, <4 x float> %sub, <4 x float> %__B, i8 %__U, i32 4)
  ret <4 x float> %0
}

define <4 x float> @test_mm_maskz_fnmadd_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fnmadd_ss:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x89,0xad,0xc2]
; X86-NEXT:    ## xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fnmadd_ss:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x89,0xad,0xc2]
; X64-NEXT:    ## xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = extractelement <4 x float> %__A, i64 0
  %.rhs.i = extractelement <4 x float> %__B, i64 0
  %1 = fsub float -0.000000e+00, %.rhs.i
  %2 = extractelement <4 x float> %__C, i64 0
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_fnmadd_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fnmadd_round_ss:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x89,0xad,0xc2]
; X86-NEXT:    ## xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fnmadd_round_ss:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x89,0xad,0xc2]
; X64-NEXT:    ## xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %0 = tail call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %__A, <4 x float> %sub, <4 x float> %__C, i8 %__U, i32 4)
  ret <4 x float> %0
}

define <4 x float> @test_mm_mask3_fnmadd_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmadd_ss:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd231ss %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbd,0xd1]
; X86-NEXT:    ## xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0 ## encoding: [0xc5,0xf8,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fnmadd_ss:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd231ss %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbd,0xd1]
; X64-NEXT:    ## xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0 ## encoding: [0xc5,0xf8,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %.rhs.i = extractelement <4 x float> %__X, i64 0
  %1 = fsub float -0.000000e+00, %.rhs.i
  %2 = extractelement <4 x float> %__Y, i64 0
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <4 x float> %__Y, i32 0
  %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
  %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask3_fnmadd_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmadd_round_ss:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd231ss %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbd,0xd1]
; X86-NEXT:    ## xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0 ## encoding: [0xc5,0xf8,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fnmadd_round_ss:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd231ss %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbd,0xd1]
; X64-NEXT:    ## xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0 ## encoding: [0xc5,0xf8,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__X
  %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %__W, <4 x float> %sub, <4 x float> %__Y, i8 %__U, i32 4)
  ret <4 x float> %0
}

define <4 x float> @test_mm_mask_fnmsub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fnmsub_ss:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xaf,0xc2]
; X86-NEXT:    ## xmm0 {%k1} = -(xmm1 * xmm0) - xmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fnmsub_ss:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xaf,0xc2]
; X64-NEXT:    ## xmm0 {%k1} = -(xmm1 * xmm0) - xmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %.rhs.i = extractelement <4 x float> %__A, i64 0
  %1 = fsub float -0.000000e+00, %.rhs.i
  %.rhs7.i = extractelement <4 x float> %__B, i64 0
  %2 = fsub float -0.000000e+00, %.rhs7.i
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext2.i = extractelement <4 x float> %__W, i32 0
  %cond.i = select i1 %tobool.i, float %vecext2.i, float %3
  %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask_fnmsub_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fnmsub_round_ss:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xaf,0xc2]
; X86-NEXT:    ## xmm0 {%k1} = -(xmm1 * xmm0) - xmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fnmsub_round_ss:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xaf,0xc2]
; X64-NEXT:    ## xmm0 {%k1} = -(xmm1 * xmm0) - xmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %sub1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %__W, <4 x float> %sub, <4 x float> %sub1, i8 %__U, i32 4)
  ret <4 x float> %0
}

define <4 x float> @test_mm_maskz_fnmsub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fnmsub_ss:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x89,0xaf,0xc2]
; X86-NEXT:    ## xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fnmsub_ss:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x89,0xaf,0xc2]
; X64-NEXT:    ## xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = extractelement <4 x float> %__A, i64 0
  %.rhs.i = extractelement <4 x float> %__B, i64 0
  %1 = fsub float -0.000000e+00, %.rhs.i
  %.rhs5.i = extractelement <4 x float> %__C, i64 0
  %2 = fsub float -0.000000e+00, %.rhs5.i
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_fnmsub_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fnmsub_round_ss:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x89,0xaf,0xc2]
; X86-NEXT:    ## xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fnmsub_round_ss:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x89,0xaf,0xc2]
; X64-NEXT:    ## xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %sub1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %__A, <4 x float> %sub, <4 x float> %sub1, i8 %__U, i32 4)
  ret <4 x float> %0
}

define <4 x float> @test_mm_mask3_fnmsub_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmsub_ss:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub231ss %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbf,0xd1]
; X86-NEXT:    ## xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0 ## encoding: [0xc5,0xf8,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fnmsub_ss:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub231ss %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbf,0xd1]
; X64-NEXT:    ## xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0 ## encoding: [0xc5,0xf8,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %.rhs.i = extractelement <4 x float> %__X, i64 0
  %1 = fsub float -0.000000e+00, %.rhs.i
  %.rhs7.i = extractelement <4 x float> %__Y, i64 0
  %2 = fsub float -0.000000e+00, %.rhs7.i
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext2.i = extractelement <4 x float> %__Y, i32 0
  %cond.i = select i1 %tobool.i, float %vecext2.i, float %3
  %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask3_fnmsub_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmsub_round_ss:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub231ss %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbf,0xd1]
; X86-NEXT:    ## xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0 ## encoding: [0xc5,0xf8,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fnmsub_round_ss:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub231ss %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbf,0xd1]
; X64-NEXT:    ## xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0 ## encoding: [0xc5,0xf8,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__X
  %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %__W, <4 x float> %sub, <4 x float> %__Y, i8 %__U, i32 4)
  ret <4 x float> %0
}

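; Double-precision (sd) variants of the same scalar masked FMA patterns,
; using @llvm.fma.f64 and the *.vfmadd.sd intrinsics.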
2529define <2 x double> @test_mm_mask_fmadd_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2530; X86-LABEL: test_mm_mask_fmadd_sd:
2531; X86:       ## %bb.0: ## %entry
2532; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
2533; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
2534; X86-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa9,0xc2]
2535; X86-NEXT:    ## xmm0 {%k1} = (xmm1 * xmm0) + xmm2
2536; X86-NEXT:    retl ## encoding: [0xc3]
2537;
2538; X64-LABEL: test_mm_mask_fmadd_sd:
2539; X64:       ## %bb.0: ## %entry
2540; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
2541; X64-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa9,0xc2]
2542; X64-NEXT:    ## xmm0 {%k1} = (xmm1 * xmm0) + xmm2
2543; X64-NEXT:    retq ## encoding: [0xc3]
2544entry:
2545  %0 = extractelement <2 x double> %__W, i64 0
2546  %1 = extractelement <2 x double> %__A, i64 0
2547  %2 = extractelement <2 x double> %__B, i64 0
2548  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
2549  %4 = and i8 %__U, 1
2550  %tobool.i = icmp eq i8 %4, 0
2551  %vecext1.i = extractelement <2 x double> %__W, i32 0
2552  %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
2553  %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
2554  ret <2 x double> %vecins.i
2555}
2556
2557define <2 x double> @test_mm_mask_fmadd_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2558; X86-LABEL: test_mm_mask_fmadd_round_sd:
2559; X86:       ## %bb.0: ## %entry
2560; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
2561; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
2562; X86-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa9,0xc2]
2563; X86-NEXT:    ## xmm0 {%k1} = (xmm1 * xmm0) + xmm2
2564; X86-NEXT:    retl ## encoding: [0xc3]
2565;
2566; X64-LABEL: test_mm_mask_fmadd_round_sd:
2567; X64:       ## %bb.0: ## %entry
2568; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
2569; X64-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa9,0xc2]
2570; X64-NEXT:    ## xmm0 {%k1} = (xmm1 * xmm0) + xmm2
2571; X64-NEXT:    retq ## encoding: [0xc3]
2572entry:
2573  %0 = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %__W, <2 x double> %__A, <2 x double> %__B, i8 %__U, i32 4)
2574  ret <2 x double> %0
2575}
2576
2577declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) #1
2578
2579define <2 x double> @test_mm_maskz_fmadd_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
2580; X86-LABEL: test_mm_maskz_fmadd_sd:
2581; X86:       ## %bb.0: ## %entry
2582; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
2583; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
2584; X86-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x89,0xa9,0xc2]
2585; X86-NEXT:    ## xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
2586; X86-NEXT:    retl ## encoding: [0xc3]
2587;
2588; X64-LABEL: test_mm_maskz_fmadd_sd:
2589; X64:       ## %bb.0: ## %entry
2590; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
2591; X64-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x89,0xa9,0xc2]
2592; X64-NEXT:    ## xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
2593; X64-NEXT:    retq ## encoding: [0xc3]
2594entry:
2595  %0 = extractelement <2 x double> %__A, i64 0
2596  %1 = extractelement <2 x double> %__B, i64 0
2597  %2 = extractelement <2 x double> %__C, i64 0
2598  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
2599  %4 = and i8 %__U, 1
2600  %tobool.i = icmp eq i8 %4, 0
2601  %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
2602  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
2603  ret <2 x double> %vecins.i
2604}
2605
2606define <2 x double> @test_mm_maskz_fmadd_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
2607; X86-LABEL: test_mm_maskz_fmadd_round_sd:
2608; X86:       ## %bb.0: ## %entry
2609; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
2610; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
2611; X86-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x89,0xa9,0xc2]
2612; X86-NEXT:    ## xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
2613; X86-NEXT:    retl ## encoding: [0xc3]
2614;
2615; X64-LABEL: test_mm_maskz_fmadd_round_sd:
2616; X64:       ## %bb.0: ## %entry
2617; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
2618; X64-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x89,0xa9,0xc2]
2619; X64-NEXT:    ## xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
2620; X64-NEXT:    retq ## encoding: [0xc3]
2621entry:
2622  %0 = tail call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 %__U, i32 4)
2623  ret <2 x double> %0
2624}
2625
2626declare <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) #1
2627
define <2 x double> @test_mm_mask3_fmadd_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmadd_sd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd231sd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xb9,0xd1]
; X86-NEXT:    ## xmm2 {%k1} = (xmm0 * xmm1) + xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0 ## encoding: [0xc5,0xf9,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fmadd_sd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd231sd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xb9,0xd1]
; X64-NEXT:    ## xmm2 {%k1} = (xmm0 * xmm1) + xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0 ## encoding: [0xc5,0xf9,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %1 = extractelement <2 x double> %__X, i64 0
  %2 = extractelement <2 x double> %__Y, i64 0
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <2 x double> %__Y, i32 0
  %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
  %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_mask3_fmadd_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmadd_round_sd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd231sd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xb9,0xd1]
; X86-NEXT:    ## xmm2 {%k1} = (xmm0 * xmm1) + xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0 ## encoding: [0xc5,0xf9,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fmadd_round_sd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd231sd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xb9,0xd1]
; X64-NEXT:    ## xmm2 {%k1} = (xmm0 * xmm1) + xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0 ## encoding: [0xc5,0xf9,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = tail call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 %__U, i32 4)
  ret <2 x double> %0
}

declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) #1

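; NOTE: There is no dedicated fmsub intrinsic in most of these tests: the IR
; negates the addend with an fsub from -0.0 and reuses llvm.fma.f64 or the
; masked vfmadd intrinsics (the round mask3 variant calls
; llvm.x86.avx512.mask3.vfmsub.sd directly). Either way, the backend should
; fold the negation and emit vfmsub213sd/vfmsub231sd.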
define <2 x double> @test_mm_mask_fmsub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_fmsub_sd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xab,0xc2]
; X86-NEXT:    ## xmm0 {%k1} = (xmm1 * xmm0) - xmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fmsub_sd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xab,0xc2]
; X64-NEXT:    ## xmm0 {%k1} = (xmm1 * xmm0) - xmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %1 = extractelement <2 x double> %__A, i64 0
  %.rhs.i = extractelement <2 x double> %__B, i64 0
  %2 = fsub double -0.000000e+00, %.rhs.i
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <2 x double> %__W, i32 0
  %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
  %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_mask_fmsub_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_fmsub_round_sd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xab,0xc2]
; X86-NEXT:    ## xmm0 {%k1} = (xmm1 * xmm0) - xmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fmsub_round_sd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xab,0xc2]
; X64-NEXT:    ## xmm0 {%k1} = (xmm1 * xmm0) - xmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
  %0 = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %__W, <2 x double> %__A, <2 x double> %sub, i8 %__U, i32 4)
  ret <2 x double> %0
}

define <2 x double> @test_mm_maskz_fmsub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmsub_sd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x89,0xab,0xc2]
; X86-NEXT:    ## xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fmsub_sd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x89,0xab,0xc2]
; X64-NEXT:    ## xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = extractelement <2 x double> %__A, i64 0
  %1 = extractelement <2 x double> %__B, i64 0
  %.rhs.i = extractelement <2 x double> %__C, i64 0
  %2 = fsub double -0.000000e+00, %.rhs.i
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_fmsub_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmsub_round_sd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x89,0xab,0xc2]
; X86-NEXT:    ## xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fmsub_round_sd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x89,0xab,0xc2]
; X64-NEXT:    ## xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub, i8 %__U, i32 4)
  ret <2 x double> %0
}

define <2 x double> @test_mm_mask3_fmsub_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsub_sd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub231sd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbb,0xd1]
; X86-NEXT:    ## xmm2 {%k1} = (xmm0 * xmm1) - xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0 ## encoding: [0xc5,0xf9,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fmsub_sd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub231sd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbb,0xd1]
; X64-NEXT:    ## xmm2 {%k1} = (xmm0 * xmm1) - xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0 ## encoding: [0xc5,0xf9,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %1 = extractelement <2 x double> %__X, i64 0
  %.rhs.i = extractelement <2 x double> %__Y, i64 0
  %2 = fsub double -0.000000e+00, %.rhs.i
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <2 x double> %__Y, i32 0
  %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
  %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_mask3_fmsub_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsub_round_sd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub231sd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbb,0xd1]
; X86-NEXT:    ## xmm2 {%k1} = (xmm0 * xmm1) - xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0 ## encoding: [0xc5,0xf9,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fmsub_round_sd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub231sd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbb,0xd1]
; X64-NEXT:    ## xmm2 {%k1} = (xmm0 * xmm1) - xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0 ## encoding: [0xc5,0xf9,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = tail call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 %__U, i32 4)
  ret <2 x double> %0
}

declare <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) #1

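; NOTE: fnmadd is expressed by negating one multiplicand (fsub -0.0, x) before
; the fma call; the backend should fold the negation into
; vfnmadd213sd/vfnmadd231sd.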
define <2 x double> @test_mm_mask_fnmadd_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_fnmadd_sd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xad,0xc2]
; X86-NEXT:    ## xmm0 {%k1} = -(xmm1 * xmm0) + xmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fnmadd_sd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xad,0xc2]
; X64-NEXT:    ## xmm0 {%k1} = -(xmm1 * xmm0) + xmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %.rhs.i = extractelement <2 x double> %__A, i64 0
  %1 = fsub double -0.000000e+00, %.rhs.i
  %2 = extractelement <2 x double> %__B, i64 0
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <2 x double> %__W, i32 0
  %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
  %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_mask_fnmadd_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_fnmadd_round_sd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xad,0xc2]
; X86-NEXT:    ## xmm0 {%k1} = -(xmm1 * xmm0) + xmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fnmadd_round_sd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xad,0xc2]
; X64-NEXT:    ## xmm0 {%k1} = -(xmm1 * xmm0) + xmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %__W, <2 x double> %sub, <2 x double> %__B, i8 %__U, i32 4)
  ret <2 x double> %0
}

define <2 x double> @test_mm_maskz_fnmadd_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fnmadd_sd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x89,0xad,0xc2]
; X86-NEXT:    ## xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fnmadd_sd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x89,0xad,0xc2]
; X64-NEXT:    ## xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = extractelement <2 x double> %__A, i64 0
  %.rhs.i = extractelement <2 x double> %__B, i64 0
  %1 = fsub double -0.000000e+00, %.rhs.i
  %2 = extractelement <2 x double> %__C, i64 0
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_fnmadd_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fnmadd_round_sd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x89,0xad,0xc2]
; X86-NEXT:    ## xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fnmadd_round_sd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x89,0xad,0xc2]
; X64-NEXT:    ## xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
  %0 = tail call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %__A, <2 x double> %sub, <2 x double> %__C, i8 %__U, i32 4)
  ret <2 x double> %0
}

define <2 x double> @test_mm_mask3_fnmadd_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmadd_sd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd231sd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbd,0xd1]
; X86-NEXT:    ## xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0 ## encoding: [0xc5,0xf9,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fnmadd_sd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd231sd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbd,0xd1]
; X64-NEXT:    ## xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0 ## encoding: [0xc5,0xf9,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %.rhs.i = extractelement <2 x double> %__X, i64 0
  %1 = fsub double -0.000000e+00, %.rhs.i
  %2 = extractelement <2 x double> %__Y, i64 0
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <2 x double> %__Y, i32 0
  %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
  %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_mask3_fnmadd_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmadd_round_sd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd231sd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbd,0xd1]
; X86-NEXT:    ## xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0 ## encoding: [0xc5,0xf9,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fnmadd_round_sd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd231sd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbd,0xd1]
; X64-NEXT:    ## xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0 ## encoding: [0xc5,0xf9,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__X
  %0 = tail call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %__W, <2 x double> %sub, <2 x double> %__Y, i8 %__U, i32 4)
  ret <2 x double> %0
}

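; NOTE: fnmsub combines both negations: a multiplicand and the addend are
; negated before the fma call (or a multiplicand is negated and the vfmsub
; intrinsic reused), which should fold into the vfnmsub forms.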
define <2 x double> @test_mm_mask_fnmsub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_fnmsub_sd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xaf,0xc2]
; X86-NEXT:    ## xmm0 {%k1} = -(xmm1 * xmm0) - xmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fnmsub_sd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xaf,0xc2]
; X64-NEXT:    ## xmm0 {%k1} = -(xmm1 * xmm0) - xmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %.rhs.i = extractelement <2 x double> %__A, i64 0
  %1 = fsub double -0.000000e+00, %.rhs.i
  %.rhs7.i = extractelement <2 x double> %__B, i64 0
  %2 = fsub double -0.000000e+00, %.rhs7.i
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext2.i = extractelement <2 x double> %__W, i32 0
  %cond.i = select i1 %tobool.i, double %vecext2.i, double %3
  %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_mask_fnmsub_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_fnmsub_round_sd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xaf,0xc2]
; X86-NEXT:    ## xmm0 {%k1} = -(xmm1 * xmm0) - xmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fnmsub_round_sd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xaf,0xc2]
; X64-NEXT:    ## xmm0 {%k1} = -(xmm1 * xmm0) - xmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
  %0 = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %__W, <2 x double> %sub, <2 x double> %sub1, i8 %__U, i32 4)
  ret <2 x double> %0
}

define <2 x double> @test_mm_maskz_fnmsub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fnmsub_sd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x89,0xaf,0xc2]
; X86-NEXT:    ## xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fnmsub_sd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x89,0xaf,0xc2]
; X64-NEXT:    ## xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = extractelement <2 x double> %__A, i64 0
  %.rhs.i = extractelement <2 x double> %__B, i64 0
  %1 = fsub double -0.000000e+00, %.rhs.i
  %.rhs5.i = extractelement <2 x double> %__C, i64 0
  %2 = fsub double -0.000000e+00, %.rhs5.i
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_fnmsub_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fnmsub_round_sd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x89,0xaf,0xc2]
; X86-NEXT:    ## xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fnmsub_round_sd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x89,0xaf,0xc2]
; X64-NEXT:    ## xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
  %sub1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %__A, <2 x double> %sub, <2 x double> %sub1, i8 %__U, i32 4)
  ret <2 x double> %0
}

define <2 x double> @test_mm_mask3_fnmsub_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmsub_sd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub231sd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbf,0xd1]
; X86-NEXT:    ## xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0 ## encoding: [0xc5,0xf9,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fnmsub_sd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub231sd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbf,0xd1]
; X64-NEXT:    ## xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0 ## encoding: [0xc5,0xf9,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %.rhs.i = extractelement <2 x double> %__X, i64 0
  %1 = fsub double -0.000000e+00, %.rhs.i
  %.rhs7.i = extractelement <2 x double> %__Y, i64 0
  %2 = fsub double -0.000000e+00, %.rhs7.i
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext2.i = extractelement <2 x double> %__Y, i32 0
  %cond.i = select i1 %tobool.i, double %vecext2.i, double %3
  %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_mask3_fnmsub_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmsub_round_sd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub231sd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbf,0xd1]
; X86-NEXT:    ## xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0 ## encoding: [0xc5,0xf9,0x28,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fnmsub_round_sd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub231sd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbf,0xd1]
; X64-NEXT:    ## xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0 ## encoding: [0xc5,0xf9,0x28,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__X
  %0 = tail call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %__W, <2 x double> %sub, <2 x double> %__Y, i8 %__U, i32 4)
  ret <2 x double> %0
}

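; NOTE: The masked scalar add/sub/mul tests below build the result manually:
; extract lane 0, perform the scalar op, select on bit 0 of the mask, and
; insert the result back into __A. The whole pattern should match a single
; masked vaddss/vsubsd/vmulss/etc.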
define <4 x float> @test_mm_mask_add_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_add_ss:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vaddss %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x76,0x09,0x58,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_add_ss:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vaddss %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x76,0x09,0x58,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %vecext.i.i = extractelement <4 x float> %__B, i32 0
  %vecext1.i.i = extractelement <4 x float> %__A, i32 0
  %add.i.i = fadd float %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %vecext1.i = extractelement <4 x float> %__W, i32 0
  %cond.i = select i1 %tobool.i, float %vecext1.i, float %add.i.i
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_add_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_maskz_add_ss:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vaddss %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x58,0xc1]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_add_ss:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vaddss %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x58,0xc1]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %vecext.i.i = extractelement <4 x float> %__B, i32 0
  %vecext1.i.i = extractelement <4 x float> %__A, i32 0
  %add.i.i = fadd float %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %cond.i = select i1 %tobool.i, float 0.000000e+00, float %add.i.i
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <2 x double> @test_mm_mask_add_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_add_sd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vaddsd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf7,0x09,0x58,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_add_sd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vaddsd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf7,0x09,0x58,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %vecext.i.i = extractelement <2 x double> %__B, i32 0
  %vecext1.i.i = extractelement <2 x double> %__A, i32 0
  %add.i.i = fadd double %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %vecext1.i = extractelement <2 x double> %__W, i32 0
  %cond.i = select i1 %tobool.i, double %vecext1.i, double %add.i.i
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_add_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_maskz_add_sd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x58,0xc1]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_add_sd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x58,0xc1]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %vecext.i.i = extractelement <2 x double> %__B, i32 0
  %vecext1.i.i = extractelement <2 x double> %__A, i32 0
  %add.i.i = fadd double %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %cond.i = select i1 %tobool.i, double 0.000000e+00, double %add.i.i
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <4 x float> @test_mm_mask_sub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_sub_ss:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vsubss %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x76,0x09,0x5c,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_sub_ss:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vsubss %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x76,0x09,0x5c,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %vecext.i.i = extractelement <4 x float> %__B, i32 0
  %vecext1.i.i = extractelement <4 x float> %__A, i32 0
  %sub.i.i = fsub float %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %vecext1.i = extractelement <4 x float> %__W, i32 0
  %cond.i = select i1 %tobool.i, float %vecext1.i, float %sub.i.i
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_sub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_maskz_sub_ss:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vsubss %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x5c,0xc1]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_sub_ss:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vsubss %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x5c,0xc1]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %vecext.i.i = extractelement <4 x float> %__B, i32 0
  %vecext1.i.i = extractelement <4 x float> %__A, i32 0
  %sub.i.i = fsub float %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %cond.i = select i1 %tobool.i, float 0.000000e+00, float %sub.i.i
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <2 x double> @test_mm_mask_sub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_sub_sd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vsubsd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf7,0x09,0x5c,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_sub_sd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vsubsd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf7,0x09,0x5c,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %vecext.i.i = extractelement <2 x double> %__B, i32 0
  %vecext1.i.i = extractelement <2 x double> %__A, i32 0
  %sub.i.i = fsub double %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %vecext1.i = extractelement <2 x double> %__W, i32 0
  %cond.i = select i1 %tobool.i, double %vecext1.i, double %sub.i.i
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_sub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_maskz_sub_sd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vsubsd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x5c,0xc1]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_sub_sd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vsubsd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x5c,0xc1]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %vecext.i.i = extractelement <2 x double> %__B, i32 0
  %vecext1.i.i = extractelement <2 x double> %__A, i32 0
  %sub.i.i = fsub double %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %cond.i = select i1 %tobool.i, double 0.000000e+00, double %sub.i.i
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <4 x float> @test_mm_mask_mul_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_mul_ss:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vmulss %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x76,0x09,0x59,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_mul_ss:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vmulss %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x76,0x09,0x59,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %vecext.i.i = extractelement <4 x float> %__B, i32 0
  %vecext1.i.i = extractelement <4 x float> %__A, i32 0
  %mul.i.i = fmul float %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %vecext1.i = extractelement <4 x float> %__W, i32 0
  %cond.i = select i1 %tobool.i, float %vecext1.i, float %mul.i.i
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_mul_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_maskz_mul_ss:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vmulss %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x59,0xc1]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_mul_ss:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vmulss %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x59,0xc1]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %vecext.i.i = extractelement <4 x float> %__B, i32 0
  %vecext1.i.i = extractelement <4 x float> %__A, i32 0
  %mul.i.i = fmul float %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %cond.i = select i1 %tobool.i, float 0.000000e+00, float %mul.i.i
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <2 x double> @test_mm_mask_mul_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_mul_sd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vmulsd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf7,0x09,0x59,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_mul_sd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vmulsd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf7,0x09,0x59,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %vecext.i.i = extractelement <2 x double> %__B, i32 0
  %vecext1.i.i = extractelement <2 x double> %__A, i32 0
  %mul.i.i = fmul double %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %vecext1.i = extractelement <2 x double> %__W, i32 0
  %cond.i = select i1 %tobool.i, double %vecext1.i, double %mul.i.i
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_mul_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_maskz_mul_sd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vmulsd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x59,0xc1]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_mul_sd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vmulsd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x59,0xc1]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %vecext.i.i = extractelement <2 x double> %__B, i32 0
  %vecext1.i.i = extractelement <2 x double> %__A, i32 0
  %mul.i.i = fmul double %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %cond.i = select i1 %tobool.i, double 0.000000e+00, double %mul.i.i
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

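; NOTE: The div tests use a different mask idiom than add/sub/mul: the i8 mask
; is bitcast to <8 x i1> and lane 0 is extracted, rather than testing bit 0
; with an and/icmp. Both idioms should select the same masked instruction.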
define <4 x float> @test_mm_mask_div_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_div_ss:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vdivss %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x76,0x09,0x5e,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_div_ss:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vdivss %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x76,0x09,0x5e,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = extractelement <4 x float> %__A, i64 0
  %1 = extractelement <4 x float> %__B, i64 0
  %2 = extractelement <4 x float> %__W, i64 0
  %3 = fdiv float %0, %1
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, float %3, float %2
  %7 = insertelement <4 x float> %__A, float %6, i64 0
  ret <4 x float> %7
}

define <4 x float> @test_mm_maskz_div_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_maskz_div_ss:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vdivss %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x5e,0xc1]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_div_ss:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vdivss %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x5e,0xc1]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = extractelement <4 x float> %__A, i64 0
  %1 = extractelement <4 x float> %__B, i64 0
  %2 = fdiv float %0, %1
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = extractelement <8 x i1> %3, i64 0
  %5 = select i1 %4, float %2, float 0.000000e+00
  %6 = insertelement <4 x float> %__A, float %5, i64 0
  ret <4 x float> %6
}

define <2 x double> @test_mm_mask_div_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_div_sd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vdivsd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf7,0x09,0x5e,0xc2]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_div_sd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vdivsd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf7,0x09,0x5e,0xc2]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = extractelement <2 x double> %__A, i64 0
  %1 = extractelement <2 x double> %__B, i64 0
  %2 = extractelement <2 x double> %__W, i64 0
  %3 = fdiv double %0, %1
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, double %3, double %2
  %7 = insertelement <2 x double> %__A, double %6, i64 0
  ret <2 x double> %7
}

define <2 x double> @test_mm_maskz_div_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_maskz_div_sd:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vdivsd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x5e,0xc1]
; X86-NEXT:    retl ## encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_div_sd:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vdivsd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x5e,0xc1]
; X64-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = extractelement <2 x double> %__A, i64 0
  %1 = extractelement <2 x double> %__B, i64 0
  %2 = fdiv double %0, %1
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = extractelement <8 x i1> %3, i64 0
  %5 = select i1 %4, double %2, double 0.000000e+00
  %6 = insertelement <2 x double> %__A, double %5, i64 0
  ret <2 x double> %6
}

declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>) #9
declare <16 x float> @llvm.fma.v16f32(<16 x float>, <16 x float>, <16 x float>) #9
declare float @llvm.fma.f32(float, float, float) #9
declare double @llvm.fma.f64(double, double, double) #9
