1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=CHECK-FMA
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=CHECK-AVX512VL
4; RUN: llc < %s -mtriple=x86_64-pc-windows -mattr=+fma,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=CHECK-FMA-WIN
5
6; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/fma-builtins.c
7
; Packed single-precision a * b + c on 128-bit vectors through @llvm.fma.v4f32.
; Every configuration is expected to emit exactly one vfmadd213ps. With the
; Windows triple the expected code reads the operands through rcx, rdx and r8
; instead of receiving them in xmm0-xmm2.
8define <4 x float> @test_mm_fmadd_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
9; CHECK-FMA-LABEL: test_mm_fmadd_ps:
10; CHECK-FMA:       # %bb.0: # %entry
11; CHECK-FMA-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
12; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) + xmm2
13; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
14;
15; CHECK-AVX512VL-LABEL: test_mm_fmadd_ps:
16; CHECK-AVX512VL:       # %bb.0: # %entry
17; CHECK-AVX512VL-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
18; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) + xmm2
19; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
20;
21; CHECK-FMA-WIN-LABEL: test_mm_fmadd_ps:
22; CHECK-FMA-WIN:       # %bb.0: # %entry
23; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
24; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
25; CHECK-FMA-WIN-NEXT:    vfmadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa8,0x00]
26; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * xmm0) + mem
27; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
28entry:
29  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #2
30  ret <4 x float> %0
31}
32
; Packed double-precision a * b + c on 128-bit vectors through @llvm.fma.v2f64.
; Every configuration is expected to emit exactly one vfmadd213pd. With the
; Windows triple the expected code reads the operands through rcx, rdx and r8
; instead of receiving them in xmm0-xmm2.
33define <2 x double> @test_mm_fmadd_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
34; CHECK-FMA-LABEL: test_mm_fmadd_pd:
35; CHECK-FMA:       # %bb.0: # %entry
36; CHECK-FMA-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
37; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) + xmm2
38; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
39;
40; CHECK-AVX512VL-LABEL: test_mm_fmadd_pd:
41; CHECK-AVX512VL:       # %bb.0: # %entry
42; CHECK-AVX512VL-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
43; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) + xmm2
44; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
45;
46; CHECK-FMA-WIN-LABEL: test_mm_fmadd_pd:
47; CHECK-FMA-WIN:       # %bb.0: # %entry
48; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
49; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
50; CHECK-FMA-WIN-NEXT:    vfmadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa8,0x00]
51; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * xmm0) + mem
52; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
53entry:
54  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c) #2
55  ret <2 x double> %0
56}
57
; Scalar single-precision fma: lane 0 of %a, %b and %c is fused with
; @llvm.fma.f32 and the result is written back into lane 0 of %a, so the upper
; lanes of the returned vector come from %a. A single scalar vfmadd is
; expected: the 213 form with register operands, and the 132 form with the
; (%rdx) operand folded from memory in the Windows-triple output.
58define <4 x float> @test_mm_fmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
59; CHECK-FMA-LABEL: test_mm_fmadd_ss:
60; CHECK-FMA:       # %bb.0: # %entry
61; CHECK-FMA-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa9,0xc2]
62; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) + xmm2
63; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
64;
65; CHECK-AVX512VL-LABEL: test_mm_fmadd_ss:
66; CHECK-AVX512VL:       # %bb.0: # %entry
67; CHECK-AVX512VL-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa9,0xc2]
68; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) + xmm2
69; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
70;
71; CHECK-FMA-WIN-LABEL: test_mm_fmadd_ss:
72; CHECK-FMA-WIN:       # %bb.0: # %entry
73; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
74; CHECK-FMA-WIN-NEXT:    vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
75; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero,zero,zero
76; CHECK-FMA-WIN-NEXT:    vfmadd132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x99,0x02]
77; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm0 * mem) + xmm1
78; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
79entry:
80  %0 = extractelement <4 x float> %a, i64 0
81  %1 = extractelement <4 x float> %b, i64 0
82  %2 = extractelement <4 x float> %c, i64 0
83  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #2
84  %4 = insertelement <4 x float> %a, float %3, i64 0
85  ret <4 x float> %4
86}
87
; Scalar double-precision fma: lane 0 of %a, %b and %c is fused with
; @llvm.fma.f64 and the result is written back into lane 0 of %a, so the upper
; lane of the returned vector comes from %a. A single scalar vfmadd is
; expected: the 213 form with register operands, and the 132 form with the
; (%rdx) operand folded from memory in the Windows-triple output.
88define <2 x double> @test_mm_fmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
89; CHECK-FMA-LABEL: test_mm_fmadd_sd:
90; CHECK-FMA:       # %bb.0: # %entry
91; CHECK-FMA-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa9,0xc2]
92; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) + xmm2
93; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
94;
95; CHECK-AVX512VL-LABEL: test_mm_fmadd_sd:
96; CHECK-AVX512VL:       # %bb.0: # %entry
97; CHECK-AVX512VL-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa9,0xc2]
98; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) + xmm2
99; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
100;
101; CHECK-FMA-WIN-LABEL: test_mm_fmadd_sd:
102; CHECK-FMA-WIN:       # %bb.0: # %entry
103; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
104; CHECK-FMA-WIN-NEXT:    vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
105; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero
106; CHECK-FMA-WIN-NEXT:    vfmadd132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x99,0x02]
107; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm0 * mem) + xmm1
108; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
109entry:
110  %0 = extractelement <2 x double> %a, i64 0
111  %1 = extractelement <2 x double> %b, i64 0
112  %2 = extractelement <2 x double> %c, i64 0
113  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #2
114  %4 = insertelement <2 x double> %a, double %3, i64 0
115  ret <2 x double> %4
116}
117
; Packed single-precision a * b - c. The addend %c is negated by subtracting it
; from a -0.0 splat before it is passed to @llvm.fma.v4f32; the negation is
; expected to fold into a single vfmsub213ps rather than show up as a separate
; instruction.
118define <4 x float> @test_mm_fmsub_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
119; CHECK-FMA-LABEL: test_mm_fmsub_ps:
120; CHECK-FMA:       # %bb.0: # %entry
121; CHECK-FMA-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xaa,0xc2]
122; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) - xmm2
123; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
124;
125; CHECK-AVX512VL-LABEL: test_mm_fmsub_ps:
126; CHECK-AVX512VL:       # %bb.0: # %entry
127; CHECK-AVX512VL-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xaa,0xc2]
128; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) - xmm2
129; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
130;
131; CHECK-FMA-WIN-LABEL: test_mm_fmsub_ps:
132; CHECK-FMA-WIN:       # %bb.0: # %entry
133; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
134; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
135; CHECK-FMA-WIN-NEXT:    vfmsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xaa,0x00]
136; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * xmm0) - mem
137; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
138entry:
139  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
140  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %sub.i) #2
141  ret <4 x float> %0
142}
143
; Packed double-precision a * b - c. The addend %c is negated by subtracting it
; from a -0.0 splat before it is passed to @llvm.fma.v2f64; the negation is
; expected to fold into a single vfmsub213pd rather than show up as a separate
; instruction.
144define <2 x double> @test_mm_fmsub_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
145; CHECK-FMA-LABEL: test_mm_fmsub_pd:
146; CHECK-FMA:       # %bb.0: # %entry
147; CHECK-FMA-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xaa,0xc2]
148; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) - xmm2
149; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
150;
151; CHECK-AVX512VL-LABEL: test_mm_fmsub_pd:
152; CHECK-AVX512VL:       # %bb.0: # %entry
153; CHECK-AVX512VL-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xaa,0xc2]
154; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) - xmm2
155; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
156;
157; CHECK-FMA-WIN-LABEL: test_mm_fmsub_pd:
158; CHECK-FMA-WIN:       # %bb.0: # %entry
159; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
160; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
161; CHECK-FMA-WIN-NEXT:    vfmsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xaa,0x00]
162; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * xmm0) - mem
163; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
164entry:
165  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c
166  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %sub.i) #2
167  ret <2 x double> %0
168}
169
; Scalar single-precision a * b - c on lane 0. Lane 0 of %c is negated
; (subtracted from -0.0) before the @llvm.fma.f32 call, and the result replaces
; lane 0 of %a. The negation is expected to fold into one scalar vfmsub
; instruction (213 form with register operands, 132 form with a memory operand
; in the Windows-triple output).
170define <4 x float> @test_mm_fmsub_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
171; CHECK-FMA-LABEL: test_mm_fmsub_ss:
172; CHECK-FMA:       # %bb.0: # %entry
173; CHECK-FMA-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xab,0xc2]
174; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) - xmm2
175; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
176;
177; CHECK-AVX512VL-LABEL: test_mm_fmsub_ss:
178; CHECK-AVX512VL:       # %bb.0: # %entry
179; CHECK-AVX512VL-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xab,0xc2]
180; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) - xmm2
181; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
182;
183; CHECK-FMA-WIN-LABEL: test_mm_fmsub_ss:
184; CHECK-FMA-WIN:       # %bb.0: # %entry
185; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
186; CHECK-FMA-WIN-NEXT:    vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
187; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero,zero,zero
188; CHECK-FMA-WIN-NEXT:    vfmsub132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9b,0x02]
189; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm0 * mem) - xmm1
190; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
191entry:
192  %0 = extractelement <4 x float> %a, i64 0
193  %1 = extractelement <4 x float> %b, i64 0
194  %.rhs.i = extractelement <4 x float> %c, i64 0
195  %2 = fsub float -0.000000e+00, %.rhs.i
196  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #2
197  %4 = insertelement <4 x float> %a, float %3, i64 0
198  ret <4 x float> %4
199}
200
; Scalar double-precision a * b - c on lane 0. Lane 0 of %c is negated
; (subtracted from -0.0) before the @llvm.fma.f64 call, and the result replaces
; lane 0 of %a. The negation is expected to fold into one scalar vfmsub
; instruction (213 form with register operands, 132 form with a memory operand
; in the Windows-triple output).
201define <2 x double> @test_mm_fmsub_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
202; CHECK-FMA-LABEL: test_mm_fmsub_sd:
203; CHECK-FMA:       # %bb.0: # %entry
204; CHECK-FMA-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xab,0xc2]
205; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) - xmm2
206; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
207;
208; CHECK-AVX512VL-LABEL: test_mm_fmsub_sd:
209; CHECK-AVX512VL:       # %bb.0: # %entry
210; CHECK-AVX512VL-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xab,0xc2]
211; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) - xmm2
212; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
213;
214; CHECK-FMA-WIN-LABEL: test_mm_fmsub_sd:
215; CHECK-FMA-WIN:       # %bb.0: # %entry
216; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
217; CHECK-FMA-WIN-NEXT:    vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
218; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero
219; CHECK-FMA-WIN-NEXT:    vfmsub132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9b,0x02]
220; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm0 * mem) - xmm1
221; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
222entry:
223  %0 = extractelement <2 x double> %a, i64 0
224  %1 = extractelement <2 x double> %b, i64 0
225  %.rhs.i = extractelement <2 x double> %c, i64 0
226  %2 = fsub double -0.000000e+00, %.rhs.i
227  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #2
228  %4 = insertelement <2 x double> %a, double %3, i64 0
229  ret <2 x double> %4
230}
231
; Packed single-precision -(a * b) + c. The multiplicand %a is negated by
; subtracting it from a -0.0 splat before the @llvm.fma.v4f32 call; the
; negation is expected to fold into a single vfnmadd213ps.
232define <4 x float> @test_mm_fnmadd_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
233; CHECK-FMA-LABEL: test_mm_fnmadd_ps:
234; CHECK-FMA:       # %bb.0: # %entry
235; CHECK-FMA-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xac,0xc2]
236; CHECK-FMA-NEXT:    # xmm0 = -(xmm1 * xmm0) + xmm2
237; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
238;
239; CHECK-AVX512VL-LABEL: test_mm_fnmadd_ps:
240; CHECK-AVX512VL:       # %bb.0: # %entry
241; CHECK-AVX512VL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xac,0xc2]
242; CHECK-AVX512VL-NEXT:    # xmm0 = -(xmm1 * xmm0) + xmm2
243; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
244;
245; CHECK-FMA-WIN-LABEL: test_mm_fnmadd_ps:
246; CHECK-FMA-WIN:       # %bb.0: # %entry
247; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
248; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
249; CHECK-FMA-WIN-NEXT:    vfnmadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xac,0x00]
250; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm1 * xmm0) + mem
251; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
252entry:
253  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
254  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %b, <4 x float> %c) #2
255  ret <4 x float> %0
256}
257
; Packed double-precision -(a * b) + c. The multiplicand %a is negated by
; subtracting it from a -0.0 splat before the @llvm.fma.v2f64 call; the
; negation is expected to fold into a single vfnmadd213pd.
258define <2 x double> @test_mm_fnmadd_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
259; CHECK-FMA-LABEL: test_mm_fnmadd_pd:
260; CHECK-FMA:       # %bb.0: # %entry
261; CHECK-FMA-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xac,0xc2]
262; CHECK-FMA-NEXT:    # xmm0 = -(xmm1 * xmm0) + xmm2
263; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
264;
265; CHECK-AVX512VL-LABEL: test_mm_fnmadd_pd:
266; CHECK-AVX512VL:       # %bb.0: # %entry
267; CHECK-AVX512VL-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xac,0xc2]
268; CHECK-AVX512VL-NEXT:    # xmm0 = -(xmm1 * xmm0) + xmm2
269; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
270;
271; CHECK-FMA-WIN-LABEL: test_mm_fnmadd_pd:
272; CHECK-FMA-WIN:       # %bb.0: # %entry
273; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
274; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
275; CHECK-FMA-WIN-NEXT:    vfnmadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xac,0x00]
276; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm1 * xmm0) + mem
277; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
278entry:
279  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a
280  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %b, <2 x double> %c) #2
281  ret <2 x double> %0
282}
283
; Scalar single-precision -(a * b) + c on lane 0. Here the negated multiplicand
; is lane 0 of %b (subtracted from -0.0); the @llvm.fma.f32 result replaces
; lane 0 of %a. The negation is expected to fold into one scalar vfnmadd
; instruction (213 form with register operands, 132 form with a memory operand
; in the Windows-triple output).
284define <4 x float> @test_mm_fnmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
285; CHECK-FMA-LABEL: test_mm_fnmadd_ss:
286; CHECK-FMA:       # %bb.0: # %entry
287; CHECK-FMA-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xad,0xc2]
288; CHECK-FMA-NEXT:    # xmm0 = -(xmm1 * xmm0) + xmm2
289; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
290;
291; CHECK-AVX512VL-LABEL: test_mm_fnmadd_ss:
292; CHECK-AVX512VL:       # %bb.0: # %entry
293; CHECK-AVX512VL-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xad,0xc2]
294; CHECK-AVX512VL-NEXT:    # xmm0 = -(xmm1 * xmm0) + xmm2
295; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
296;
297; CHECK-FMA-WIN-LABEL: test_mm_fnmadd_ss:
298; CHECK-FMA-WIN:       # %bb.0: # %entry
299; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
300; CHECK-FMA-WIN-NEXT:    vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
301; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero,zero,zero
302; CHECK-FMA-WIN-NEXT:    vfnmadd132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9d,0x02]
303; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm0 * mem) + xmm1
304; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
305entry:
306  %0 = extractelement <4 x float> %a, i64 0
307  %.rhs.i = extractelement <4 x float> %b, i64 0
308  %1 = fsub float -0.000000e+00, %.rhs.i
309  %2 = extractelement <4 x float> %c, i64 0
310  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #2
311  %4 = insertelement <4 x float> %a, float %3, i64 0
312  ret <4 x float> %4
313}
314
; Scalar double-precision -(a * b) + c on lane 0. Here the negated multiplicand
; is lane 0 of %b (subtracted from -0.0); the @llvm.fma.f64 result replaces
; lane 0 of %a. The negation is expected to fold into one scalar vfnmadd
; instruction (213 form with register operands, 132 form with a memory operand
; in the Windows-triple output).
315define <2 x double> @test_mm_fnmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
316; CHECK-FMA-LABEL: test_mm_fnmadd_sd:
317; CHECK-FMA:       # %bb.0: # %entry
318; CHECK-FMA-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xad,0xc2]
319; CHECK-FMA-NEXT:    # xmm0 = -(xmm1 * xmm0) + xmm2
320; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
321;
322; CHECK-AVX512VL-LABEL: test_mm_fnmadd_sd:
323; CHECK-AVX512VL:       # %bb.0: # %entry
324; CHECK-AVX512VL-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xad,0xc2]
325; CHECK-AVX512VL-NEXT:    # xmm0 = -(xmm1 * xmm0) + xmm2
326; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
327;
328; CHECK-FMA-WIN-LABEL: test_mm_fnmadd_sd:
329; CHECK-FMA-WIN:       # %bb.0: # %entry
330; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
331; CHECK-FMA-WIN-NEXT:    vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
332; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero
333; CHECK-FMA-WIN-NEXT:    vfnmadd132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9d,0x02]
334; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm0 * mem) + xmm1
335; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
336entry:
337  %0 = extractelement <2 x double> %a, i64 0
338  %.rhs.i = extractelement <2 x double> %b, i64 0
339  %1 = fsub double -0.000000e+00, %.rhs.i
340  %2 = extractelement <2 x double> %c, i64 0
341  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #2
342  %4 = insertelement <2 x double> %a, double %3, i64 0
343  ret <2 x double> %4
344}
345
; Packed single-precision -(a * b) - c. Both the multiplicand %a and the addend
; %c are negated (each subtracted from a -0.0 splat) before the
; @llvm.fma.v4f32 call; both negations are expected to fold into a single
; vfnmsub213ps.
346define <4 x float> @test_mm_fnmsub_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
347; CHECK-FMA-LABEL: test_mm_fnmsub_ps:
348; CHECK-FMA:       # %bb.0: # %entry
349; CHECK-FMA-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xae,0xc2]
350; CHECK-FMA-NEXT:    # xmm0 = -(xmm1 * xmm0) - xmm2
351; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
352;
353; CHECK-AVX512VL-LABEL: test_mm_fnmsub_ps:
354; CHECK-AVX512VL:       # %bb.0: # %entry
355; CHECK-AVX512VL-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xae,0xc2]
356; CHECK-AVX512VL-NEXT:    # xmm0 = -(xmm1 * xmm0) - xmm2
357; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
358;
359; CHECK-FMA-WIN-LABEL: test_mm_fnmsub_ps:
360; CHECK-FMA-WIN:       # %bb.0: # %entry
361; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
362; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
363; CHECK-FMA-WIN-NEXT:    vfnmsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xae,0x00]
364; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm1 * xmm0) - mem
365; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
366entry:
367  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
368  %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
369  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %b, <4 x float> %sub1.i) #2
370  ret <4 x float> %0
371}
372
; Packed double-precision -(a * b) - c. Both the multiplicand %a and the addend
; %c are negated (each subtracted from a -0.0 splat) before the
; @llvm.fma.v2f64 call; both negations are expected to fold into a single
; vfnmsub213pd.
373define <2 x double> @test_mm_fnmsub_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
374; CHECK-FMA-LABEL: test_mm_fnmsub_pd:
375; CHECK-FMA:       # %bb.0: # %entry
376; CHECK-FMA-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xae,0xc2]
377; CHECK-FMA-NEXT:    # xmm0 = -(xmm1 * xmm0) - xmm2
378; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
379;
380; CHECK-AVX512VL-LABEL: test_mm_fnmsub_pd:
381; CHECK-AVX512VL:       # %bb.0: # %entry
382; CHECK-AVX512VL-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xae,0xc2]
383; CHECK-AVX512VL-NEXT:    # xmm0 = -(xmm1 * xmm0) - xmm2
384; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
385;
386; CHECK-FMA-WIN-LABEL: test_mm_fnmsub_pd:
387; CHECK-FMA-WIN:       # %bb.0: # %entry
388; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
389; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
390; CHECK-FMA-WIN-NEXT:    vfnmsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xae,0x00]
391; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm1 * xmm0) - mem
392; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
393entry:
394  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a
395  %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c
396  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %b, <2 x double> %sub1.i) #2
397  ret <2 x double> %0
398}
399
; Scalar single-precision -(a * b) - c on lane 0. Lane 0 of %b and lane 0 of %c
; are each negated (subtracted from -0.0) before the @llvm.fma.f32 call, and
; the result replaces lane 0 of %a. Both negations are expected to fold into
; one scalar vfnmsub instruction (213 form with register operands, 132 form
; with a memory operand in the Windows-triple output).
400define <4 x float> @test_mm_fnmsub_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
401; CHECK-FMA-LABEL: test_mm_fnmsub_ss:
402; CHECK-FMA:       # %bb.0: # %entry
403; CHECK-FMA-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xaf,0xc2]
404; CHECK-FMA-NEXT:    # xmm0 = -(xmm1 * xmm0) - xmm2
405; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
406;
407; CHECK-AVX512VL-LABEL: test_mm_fnmsub_ss:
408; CHECK-AVX512VL:       # %bb.0: # %entry
409; CHECK-AVX512VL-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xaf,0xc2]
410; CHECK-AVX512VL-NEXT:    # xmm0 = -(xmm1 * xmm0) - xmm2
411; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
412;
413; CHECK-FMA-WIN-LABEL: test_mm_fnmsub_ss:
414; CHECK-FMA-WIN:       # %bb.0: # %entry
415; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
416; CHECK-FMA-WIN-NEXT:    vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
417; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero,zero,zero
418; CHECK-FMA-WIN-NEXT:    vfnmsub132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9f,0x02]
419; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm0 * mem) - xmm1
420; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
421entry:
422  %0 = extractelement <4 x float> %a, i64 0
423  %.rhs.i = extractelement <4 x float> %b, i64 0
424  %1 = fsub float -0.000000e+00, %.rhs.i
425  %.rhs2.i = extractelement <4 x float> %c, i64 0
426  %2 = fsub float -0.000000e+00, %.rhs2.i
427  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #2
428  %4 = insertelement <4 x float> %a, float %3, i64 0
429  ret <4 x float> %4
430}
431
; Scalar double-precision -(a * b) - c on lane 0. Lane 0 of %b and lane 0 of %c
; are each negated (subtracted from -0.0) before the @llvm.fma.f64 call, and
; the result replaces lane 0 of %a. Both negations are expected to fold into
; one scalar vfnmsub instruction (213 form with register operands, 132 form
; with a memory operand in the Windows-triple output).
432define <2 x double> @test_mm_fnmsub_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
433; CHECK-FMA-LABEL: test_mm_fnmsub_sd:
434; CHECK-FMA:       # %bb.0: # %entry
435; CHECK-FMA-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xaf,0xc2]
436; CHECK-FMA-NEXT:    # xmm0 = -(xmm1 * xmm0) - xmm2
437; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
438;
439; CHECK-AVX512VL-LABEL: test_mm_fnmsub_sd:
440; CHECK-AVX512VL:       # %bb.0: # %entry
441; CHECK-AVX512VL-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xaf,0xc2]
442; CHECK-AVX512VL-NEXT:    # xmm0 = -(xmm1 * xmm0) - xmm2
443; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
444;
445; CHECK-FMA-WIN-LABEL: test_mm_fnmsub_sd:
446; CHECK-FMA-WIN:       # %bb.0: # %entry
447; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
448; CHECK-FMA-WIN-NEXT:    vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
449; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero
450; CHECK-FMA-WIN-NEXT:    vfnmsub132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9f,0x02]
451; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm0 * mem) - xmm1
452; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
453entry:
454  %0 = extractelement <2 x double> %a, i64 0
455  %.rhs.i = extractelement <2 x double> %b, i64 0
456  %1 = fsub double -0.000000e+00, %.rhs.i
457  %.rhs2.i = extractelement <2 x double> %c, i64 0
458  %2 = fsub double -0.000000e+00, %.rhs2.i
459  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #2
460  %4 = insertelement <2 x double> %a, double %3, i64 0
461  ret <2 x double> %4
462}
463
; Packed single-precision alternating subtract/add. Two fmas are computed over
; the same multiplicands: %0 = a * b + c and %2 = a * b + (-c). The final
; shufflevector interleaves them, and the whole pattern is expected to merge
; into a single vfmaddsub213ps.
464define <4 x float> @test_mm_fmaddsub_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
465; CHECK-FMA-LABEL: test_mm_fmaddsub_ps:
466; CHECK-FMA:       # %bb.0: # %entry
467; CHECK-FMA-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa6,0xc2]
468; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) +/- xmm2
469; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
470;
471; CHECK-AVX512VL-LABEL: test_mm_fmaddsub_ps:
472; CHECK-AVX512VL:       # %bb.0: # %entry
473; CHECK-AVX512VL-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa6,0xc2]
474; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) +/- xmm2
475; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
476;
477; CHECK-FMA-WIN-LABEL: test_mm_fmaddsub_ps:
478; CHECK-FMA-WIN:       # %bb.0: # %entry
479; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
480; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
481; CHECK-FMA-WIN-NEXT:    vfmaddsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa6,0x00]
482; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * xmm0) +/- mem
483; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
484entry:
485  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #2
486  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
487  %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %1) #2
  ; Mask <0, 5, 2, 7>: lanes 0 and 2 come from %2 (subtract), lanes 1 and 3
  ; come from %0 (add).
488  %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
489  ret <4 x float> %3
490}
491
; Packed double-precision alternating subtract/add. Two fmas are computed over
; the same multiplicands: %0 = a * b + c and %2 = a * b + (-c). The final
; shufflevector interleaves them, and the whole pattern is expected to merge
; into a single vfmaddsub213pd.
492define <2 x double> @test_mm_fmaddsub_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
493; CHECK-FMA-LABEL: test_mm_fmaddsub_pd:
494; CHECK-FMA:       # %bb.0: # %entry
495; CHECK-FMA-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa6,0xc2]
496; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) +/- xmm2
497; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
498;
499; CHECK-AVX512VL-LABEL: test_mm_fmaddsub_pd:
500; CHECK-AVX512VL:       # %bb.0: # %entry
501; CHECK-AVX512VL-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa6,0xc2]
502; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) +/- xmm2
503; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
504;
505; CHECK-FMA-WIN-LABEL: test_mm_fmaddsub_pd:
506; CHECK-FMA-WIN:       # %bb.0: # %entry
507; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
508; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
509; CHECK-FMA-WIN-NEXT:    vfmaddsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa6,0x00]
510; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * xmm0) +/- mem
511; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
512entry:
513  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c) #2
514  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c
515  %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %1) #2
  ; Mask <0, 3>: lane 0 comes from %2 (subtract), lane 1 comes from %0 (add).
516  %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3>
517  ret <2 x double> %3
518}
519
; Packed single-precision alternating add/subtract. Two fmas are computed over
; the same multiplicands: %0 = a * b + (-c) and %1 = a * b + c. The final
; shufflevector interleaves them, and the whole pattern is expected to merge
; into a single vfmsubadd213ps.
520define <4 x float> @test_mm_fmsubadd_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
521; CHECK-FMA-LABEL: test_mm_fmsubadd_ps:
522; CHECK-FMA:       # %bb.0: # %entry
523; CHECK-FMA-NEXT:    vfmsubadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa7,0xc2]
524; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) -/+ xmm2
525; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
526;
527; CHECK-AVX512VL-LABEL: test_mm_fmsubadd_ps:
528; CHECK-AVX512VL:       # %bb.0: # %entry
529; CHECK-AVX512VL-NEXT:    vfmsubadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa7,0xc2]
530; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) -/+ xmm2
531; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
532;
533; CHECK-FMA-WIN-LABEL: test_mm_fmsubadd_ps:
534; CHECK-FMA-WIN:       # %bb.0: # %entry
535; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
536; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
537; CHECK-FMA-WIN-NEXT:    vfmsubadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa7,0x00]
538; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * xmm0) -/+ mem
539; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
540entry:
541  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
542  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %sub.i) #2
543  %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #2
  ; Mask <0, 5, 2, 7>: lanes 0 and 2 come from %1 (add), lanes 1 and 3 come
  ; from %0 (subtract).
544  %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
545  ret <4 x float> %2
546}
547
; Packed double-precision alternating add/subtract. Two fmas are computed over
; the same multiplicands: %0 = a * b + (-c) and %1 = a * b + c. The final
; shufflevector interleaves them, and the whole pattern is expected to merge
; into a single vfmsubadd213pd.
548define <2 x double> @test_mm_fmsubadd_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
549; CHECK-FMA-LABEL: test_mm_fmsubadd_pd:
550; CHECK-FMA:       # %bb.0: # %entry
551; CHECK-FMA-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa7,0xc2]
552; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) -/+ xmm2
553; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
554;
555; CHECK-AVX512VL-LABEL: test_mm_fmsubadd_pd:
556; CHECK-AVX512VL:       # %bb.0: # %entry
557; CHECK-AVX512VL-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa7,0xc2]
558; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) -/+ xmm2
559; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
560;
561; CHECK-FMA-WIN-LABEL: test_mm_fmsubadd_pd:
562; CHECK-FMA-WIN:       # %bb.0: # %entry
563; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
564; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
565; CHECK-FMA-WIN-NEXT:    vfmsubadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa7,0x00]
566; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * xmm0) -/+ mem
567; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
568entry:
569  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c
570  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %sub.i) #2
571  %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c) #2
  ; Mask <0, 3>: lane 0 comes from %1 (add), lane 1 comes from %0 (subtract).
572  %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3>
573  ret <2 x double> %2
574}
575
; 256-bit packed single-precision a * b + c through @llvm.fma.v8f32. Every
; configuration is expected to emit exactly one vfmadd213ps on ymm registers.
; With the Windows triple the expected code reads the operands through rcx,
; rdx and r8 instead of receiving them in ymm0-ymm2.
576define <8 x float> @test_mm256_fmadd_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
577; CHECK-FMA-LABEL: test_mm256_fmadd_ps:
578; CHECK-FMA:       # %bb.0: # %entry
579; CHECK-FMA-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa8,0xc2]
580; CHECK-FMA-NEXT:    # ymm0 = (ymm1 * ymm0) + ymm2
581; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
582;
583; CHECK-AVX512VL-LABEL: test_mm256_fmadd_ps:
584; CHECK-AVX512VL:       # %bb.0: # %entry
585; CHECK-AVX512VL-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa8,0xc2]
586; CHECK-AVX512VL-NEXT:    # ymm0 = (ymm1 * ymm0) + ymm2
587; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
588;
589; CHECK-FMA-WIN-LABEL: test_mm256_fmadd_ps:
590; CHECK-FMA-WIN:       # %bb.0: # %entry
591; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
592; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
593; CHECK-FMA-WIN-NEXT:    vfmadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa8,0x00]
594; CHECK-FMA-WIN-NEXT:    # ymm0 = (ymm1 * ymm0) + mem
595; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
596entry:
597  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c) #2
598  ret <8 x float> %0
599}
600
; 256-bit packed double-precision fused multiply-add: one llvm.fma.v4f64 call
; on (%a, %b, %c). Every check-line set below expects a single vfmadd213pd.
; The Windows set additionally expects two operands to be loaded through
; (%rcx)/(%rdx) and the addend to be folded into the FMA from (%r8).
601define <4 x double> @test_mm256_fmadd_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c) {
602; CHECK-FMA-LABEL: test_mm256_fmadd_pd:
603; CHECK-FMA:       # %bb.0: # %entry
604; CHECK-FMA-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
605; CHECK-FMA-NEXT:    # ymm0 = (ymm1 * ymm0) + ymm2
606; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
607;
608; CHECK-AVX512VL-LABEL: test_mm256_fmadd_pd:
609; CHECK-AVX512VL:       # %bb.0: # %entry
610; CHECK-AVX512VL-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
611; CHECK-AVX512VL-NEXT:    # ymm0 = (ymm1 * ymm0) + ymm2
612; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
613;
614; CHECK-FMA-WIN-LABEL: test_mm256_fmadd_pd:
615; CHECK-FMA-WIN:       # %bb.0: # %entry
616; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
617; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
618; CHECK-FMA-WIN-NEXT:    vfmadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa8,0x00]
619; CHECK-FMA-WIN-NEXT:    # ymm0 = (ymm1 * ymm0) + mem
620; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
621entry:
622  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c) #2
623  ret <4 x double> %0
624}
625
; 256-bit packed single-precision fused multiply-subtract: %c is negated and
; passed as the addend of llvm.fma.v8f32. Every check-line set below expects
; the negation to be absorbed into a single vfmsub213ps, with no separate
; negate instruction.
626define <8 x float> @test_mm256_fmsub_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
627; CHECK-FMA-LABEL: test_mm256_fmsub_ps:
628; CHECK-FMA:       # %bb.0: # %entry
629; CHECK-FMA-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xaa,0xc2]
630; CHECK-FMA-NEXT:    # ymm0 = (ymm1 * ymm0) - ymm2
631; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
632;
633; CHECK-AVX512VL-LABEL: test_mm256_fmsub_ps:
634; CHECK-AVX512VL:       # %bb.0: # %entry
635; CHECK-AVX512VL-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xaa,0xc2]
636; CHECK-AVX512VL-NEXT:    # ymm0 = (ymm1 * ymm0) - ymm2
637; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
638;
639; CHECK-FMA-WIN-LABEL: test_mm256_fmsub_ps:
640; CHECK-FMA-WIN:       # %bb.0: # %entry
641; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
642; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
643; CHECK-FMA-WIN-NEXT:    vfmsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xaa,0x00]
644; CHECK-FMA-WIN-NEXT:    # ymm0 = (ymm1 * ymm0) - mem
645; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
646entry:
  ; Negate %c lane-wise by subtracting it from a splat of -0.0.
647  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
648  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %sub.i) #2
649  ret <8 x float> %0
650}
651
; 256-bit packed double-precision fused multiply-subtract: %c is negated and
; passed as the addend of llvm.fma.v4f64. Every check-line set below expects
; the negation to be absorbed into a single vfmsub213pd, with no separate
; negate instruction.
652define <4 x double> @test_mm256_fmsub_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c) {
653; CHECK-FMA-LABEL: test_mm256_fmsub_pd:
654; CHECK-FMA:       # %bb.0: # %entry
655; CHECK-FMA-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xaa,0xc2]
656; CHECK-FMA-NEXT:    # ymm0 = (ymm1 * ymm0) - ymm2
657; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
658;
659; CHECK-AVX512VL-LABEL: test_mm256_fmsub_pd:
660; CHECK-AVX512VL:       # %bb.0: # %entry
661; CHECK-AVX512VL-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xaa,0xc2]
662; CHECK-AVX512VL-NEXT:    # ymm0 = (ymm1 * ymm0) - ymm2
663; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
664;
665; CHECK-FMA-WIN-LABEL: test_mm256_fmsub_pd:
666; CHECK-FMA-WIN:       # %bb.0: # %entry
667; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
668; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
669; CHECK-FMA-WIN-NEXT:    vfmsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xaa,0x00]
670; CHECK-FMA-WIN-NEXT:    # ymm0 = (ymm1 * ymm0) - mem
671; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
672entry:
  ; Negate %c lane-wise by subtracting it from a splat of -0.0.
673  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %c
674  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %sub.i) #2
675  ret <4 x double> %0
676}
677
; 256-bit packed single-precision negated multiply-add: the multiplicand %a is
; negated before the llvm.fma.v8f32 call. Every check-line set below expects
; the negation to be absorbed into a single vfnmadd213ps.
678define <8 x float> @test_mm256_fnmadd_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
679; CHECK-FMA-LABEL: test_mm256_fnmadd_ps:
680; CHECK-FMA:       # %bb.0: # %entry
681; CHECK-FMA-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xac,0xc2]
682; CHECK-FMA-NEXT:    # ymm0 = -(ymm1 * ymm0) + ymm2
683; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
684;
685; CHECK-AVX512VL-LABEL: test_mm256_fnmadd_ps:
686; CHECK-AVX512VL:       # %bb.0: # %entry
687; CHECK-AVX512VL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xac,0xc2]
688; CHECK-AVX512VL-NEXT:    # ymm0 = -(ymm1 * ymm0) + ymm2
689; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
690;
691; CHECK-FMA-WIN-LABEL: test_mm256_fnmadd_ps:
692; CHECK-FMA-WIN:       # %bb.0: # %entry
693; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
694; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
695; CHECK-FMA-WIN-NEXT:    vfnmadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xac,0x00]
696; CHECK-FMA-WIN-NEXT:    # ymm0 = -(ymm1 * ymm0) + mem
697; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
698entry:
  ; Negate %a lane-wise by subtracting it from a splat of -0.0.
699  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
700  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %b, <8 x float> %c) #2
701  ret <8 x float> %0
702}
703
; 256-bit packed double-precision negated multiply-add: the multiplicand %a is
; negated before the llvm.fma.v4f64 call. Every check-line set below expects
; the negation to be absorbed into a single vfnmadd213pd.
704define <4 x double> @test_mm256_fnmadd_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c) {
705; CHECK-FMA-LABEL: test_mm256_fnmadd_pd:
706; CHECK-FMA:       # %bb.0: # %entry
707; CHECK-FMA-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xac,0xc2]
708; CHECK-FMA-NEXT:    # ymm0 = -(ymm1 * ymm0) + ymm2
709; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
710;
711; CHECK-AVX512VL-LABEL: test_mm256_fnmadd_pd:
712; CHECK-AVX512VL:       # %bb.0: # %entry
713; CHECK-AVX512VL-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xac,0xc2]
714; CHECK-AVX512VL-NEXT:    # ymm0 = -(ymm1 * ymm0) + ymm2
715; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
716;
717; CHECK-FMA-WIN-LABEL: test_mm256_fnmadd_pd:
718; CHECK-FMA-WIN:       # %bb.0: # %entry
719; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
720; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
721; CHECK-FMA-WIN-NEXT:    vfnmadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xac,0x00]
722; CHECK-FMA-WIN-NEXT:    # ymm0 = -(ymm1 * ymm0) + mem
723; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
724entry:
  ; Negate %a lane-wise by subtracting it from a splat of -0.0.
725  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a
726  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %b, <4 x double> %c) #2
727  ret <4 x double> %0
728}
729
; 256-bit packed single-precision negated multiply-subtract: both the
; multiplicand %a and the addend %c are negated before the llvm.fma.v8f32
; call. Every check-line set below expects both negations to be absorbed into
; a single vfnmsub213ps.
730define <8 x float> @test_mm256_fnmsub_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
731; CHECK-FMA-LABEL: test_mm256_fnmsub_ps:
732; CHECK-FMA:       # %bb.0: # %entry
733; CHECK-FMA-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xae,0xc2]
734; CHECK-FMA-NEXT:    # ymm0 = -(ymm1 * ymm0) - ymm2
735; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
736;
737; CHECK-AVX512VL-LABEL: test_mm256_fnmsub_ps:
738; CHECK-AVX512VL:       # %bb.0: # %entry
739; CHECK-AVX512VL-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xae,0xc2]
740; CHECK-AVX512VL-NEXT:    # ymm0 = -(ymm1 * ymm0) - ymm2
741; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
742;
743; CHECK-FMA-WIN-LABEL: test_mm256_fnmsub_ps:
744; CHECK-FMA-WIN:       # %bb.0: # %entry
745; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
746; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
747; CHECK-FMA-WIN-NEXT:    vfnmsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xae,0x00]
748; CHECK-FMA-WIN-NEXT:    # ymm0 = -(ymm1 * ymm0) - mem
749; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
750entry:
  ; Negate %a, then %c, lane-wise by subtracting each from a splat of -0.0.
751  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
752  %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
753  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %b, <8 x float> %sub1.i) #2
754  ret <8 x float> %0
755}
756
; 256-bit packed double-precision negated multiply-subtract: both the
; multiplicand %a and the addend %c are negated before the llvm.fma.v4f64
; call. Every check-line set below expects both negations to be absorbed into
; a single vfnmsub213pd.
757define <4 x double> @test_mm256_fnmsub_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c) {
758; CHECK-FMA-LABEL: test_mm256_fnmsub_pd:
759; CHECK-FMA:       # %bb.0: # %entry
760; CHECK-FMA-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xae,0xc2]
761; CHECK-FMA-NEXT:    # ymm0 = -(ymm1 * ymm0) - ymm2
762; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
763;
764; CHECK-AVX512VL-LABEL: test_mm256_fnmsub_pd:
765; CHECK-AVX512VL:       # %bb.0: # %entry
766; CHECK-AVX512VL-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xae,0xc2]
767; CHECK-AVX512VL-NEXT:    # ymm0 = -(ymm1 * ymm0) - ymm2
768; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
769;
770; CHECK-FMA-WIN-LABEL: test_mm256_fnmsub_pd:
771; CHECK-FMA-WIN:       # %bb.0: # %entry
772; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
773; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
774; CHECK-FMA-WIN-NEXT:    vfnmsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xae,0x00]
775; CHECK-FMA-WIN-NEXT:    # ymm0 = -(ymm1 * ymm0) - mem
776; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
777entry:
  ; Negate %a, then %c, lane-wise by subtracting each from a splat of -0.0.
778  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a
779  %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %c
780  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %b, <4 x double> %sub1.i) #2
781  ret <4 x double> %0
782}
783
; 256-bit packed single-precision alternating multiply-add/subtract, written
; as two llvm.fma.v8f32 calls (one adding %c, one adding the negated %c) whose
; results are interleaved by a shufflevector. Every check-line set below
; expects the whole sequence to collapse into a single vfmaddsub213ps.
784define <8 x float> @test_mm256_fmaddsub_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
785; CHECK-FMA-LABEL: test_mm256_fmaddsub_ps:
786; CHECK-FMA:       # %bb.0: # %entry
787; CHECK-FMA-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa6,0xc2]
788; CHECK-FMA-NEXT:    # ymm0 = (ymm1 * ymm0) +/- ymm2
789; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
790;
791; CHECK-AVX512VL-LABEL: test_mm256_fmaddsub_ps:
792; CHECK-AVX512VL:       # %bb.0: # %entry
793; CHECK-AVX512VL-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa6,0xc2]
794; CHECK-AVX512VL-NEXT:    # ymm0 = (ymm1 * ymm0) +/- ymm2
795; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
796;
797; CHECK-FMA-WIN-LABEL: test_mm256_fmaddsub_ps:
798; CHECK-FMA-WIN:       # %bb.0: # %entry
799; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
800; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
801; CHECK-FMA-WIN-NEXT:    vfmaddsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa6,0x00]
802; CHECK-FMA-WIN-NEXT:    # ymm0 = (ymm1 * ymm0) +/- mem
803; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
804entry:
  ; %0 is the "add" result, %2 the "subtract" result (%1 is %c negated).
805  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c) #2
806  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
807  %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %1) #2
  ; Mask indices 0-7 select from %2 and 8-15 from %0: even result lanes take
  ; the subtract value, odd result lanes take the add value.
808  %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
809  ret <8 x float> %3
810}
811
; 256-bit packed double-precision alternating multiply-add/subtract, written
; as two llvm.fma.v4f64 calls (one adding %c, one adding the negated %c) whose
; results are interleaved by a shufflevector. Every check-line set below
; expects the whole sequence to collapse into a single vfmaddsub213pd.
812define <4 x double> @test_mm256_fmaddsub_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c) {
813; CHECK-FMA-LABEL: test_mm256_fmaddsub_pd:
814; CHECK-FMA:       # %bb.0: # %entry
815; CHECK-FMA-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa6,0xc2]
816; CHECK-FMA-NEXT:    # ymm0 = (ymm1 * ymm0) +/- ymm2
817; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
818;
819; CHECK-AVX512VL-LABEL: test_mm256_fmaddsub_pd:
820; CHECK-AVX512VL:       # %bb.0: # %entry
821; CHECK-AVX512VL-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa6,0xc2]
822; CHECK-AVX512VL-NEXT:    # ymm0 = (ymm1 * ymm0) +/- ymm2
823; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
824;
825; CHECK-FMA-WIN-LABEL: test_mm256_fmaddsub_pd:
826; CHECK-FMA-WIN:       # %bb.0: # %entry
827; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
828; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
829; CHECK-FMA-WIN-NEXT:    vfmaddsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa6,0x00]
830; CHECK-FMA-WIN-NEXT:    # ymm0 = (ymm1 * ymm0) +/- mem
831; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
832entry:
  ; %0 is the "add" result, %2 the "subtract" result (%1 is %c negated).
833  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c) #2
834  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %c
835  %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %1) #2
  ; Mask indices 0-3 select from %2 and 4-7 from %0: even result lanes take
  ; the subtract value, odd result lanes take the add value.
836  %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
837  ret <4 x double> %3
838}
839
; 256-bit packed single-precision alternating multiply-subtract/add, written
; as two llvm.fma.v8f32 calls (one adding the negated %c, one adding %c) whose
; results are interleaved by a shufflevector. Every check-line set below
; expects the whole sequence to collapse into a single vfmsubadd213ps.
840define <8 x float> @test_mm256_fmsubadd_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
841; CHECK-FMA-LABEL: test_mm256_fmsubadd_ps:
842; CHECK-FMA:       # %bb.0: # %entry
843; CHECK-FMA-NEXT:    vfmsubadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa7,0xc2]
844; CHECK-FMA-NEXT:    # ymm0 = (ymm1 * ymm0) -/+ ymm2
845; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
846;
847; CHECK-AVX512VL-LABEL: test_mm256_fmsubadd_ps:
848; CHECK-AVX512VL:       # %bb.0: # %entry
849; CHECK-AVX512VL-NEXT:    vfmsubadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa7,0xc2]
850; CHECK-AVX512VL-NEXT:    # ymm0 = (ymm1 * ymm0) -/+ ymm2
851; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
852;
853; CHECK-FMA-WIN-LABEL: test_mm256_fmsubadd_ps:
854; CHECK-FMA-WIN:       # %bb.0: # %entry
855; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
856; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
857; CHECK-FMA-WIN-NEXT:    vfmsubadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa7,0x00]
858; CHECK-FMA-WIN-NEXT:    # ymm0 = (ymm1 * ymm0) -/+ mem
859; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
860entry:
  ; %0 is the "subtract" result (%sub.i is %c negated), %1 the "add" result.
861  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
862  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %sub.i) #2
863  %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c) #2
  ; Mask indices 0-7 select from %1 and 8-15 from %0: even result lanes take
  ; the add value, odd result lanes take the subtract value.
864  %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
865  ret <8 x float> %2
866}
867
; 256-bit packed double-precision alternating multiply-subtract/add, written
; as two llvm.fma.v4f64 calls (one adding the negated %c, one adding %c) whose
; results are interleaved by a shufflevector. Every check-line set below
; expects the whole sequence to collapse into a single vfmsubadd213pd.
868define <4 x double> @test_mm256_fmsubadd_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c) {
869; CHECK-FMA-LABEL: test_mm256_fmsubadd_pd:
870; CHECK-FMA:       # %bb.0: # %entry
871; CHECK-FMA-NEXT:    vfmsubadd213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa7,0xc2]
872; CHECK-FMA-NEXT:    # ymm0 = (ymm1 * ymm0) -/+ ymm2
873; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
874;
875; CHECK-AVX512VL-LABEL: test_mm256_fmsubadd_pd:
876; CHECK-AVX512VL:       # %bb.0: # %entry
877; CHECK-AVX512VL-NEXT:    vfmsubadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa7,0xc2]
878; CHECK-AVX512VL-NEXT:    # ymm0 = (ymm1 * ymm0) -/+ ymm2
879; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
880;
881; CHECK-FMA-WIN-LABEL: test_mm256_fmsubadd_pd:
882; CHECK-FMA-WIN:       # %bb.0: # %entry
883; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
884; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
885; CHECK-FMA-WIN-NEXT:    vfmsubadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa7,0x00]
886; CHECK-FMA-WIN-NEXT:    # ymm0 = (ymm1 * ymm0) -/+ mem
887; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
888entry:
  ; %0 is the "subtract" result (%sub.i is %c negated), %1 the "add" result.
889  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %c
890  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %sub.i) #2
891  %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c) #2
  ; Mask indices 0-3 select from %1 and 4-7 from %0: even result lanes take
  ; the add value, odd result lanes take the subtract value.
892  %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
893  ret <4 x double> %2
894}
895
; Declarations of the llvm.fma intrinsic overloads: scalar float/double,
; 128-bit vectors (v4f32, v2f64) and 256-bit vectors (v8f32, v4f64).
; NOTE(review): attribute group #1 is not defined in the visible part of this
; file; confirm whether that is intentional.
896declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #1
897declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) #1
898declare float @llvm.fma.f32(float, float, float) #1
899declare double @llvm.fma.f64(double, double, double) #1
900declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>) #1
901declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #1
902