//===-- X86InstrFMA.td - FMA Instruction Set ---------------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file describes FMA (Fused Multiply-Add) instructions.
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// FMA3 - Intel 3 operand Fused Multiply-Add instructions
//===----------------------------------------------------------------------===//
160b57cec5SDimitry Andric
// For all FMA opcodes declared in the fma3p_rm_* and fma3s_rm_* multiclasses
// defined below, both the register and memory variants are commutable.
// For the register form the commutable operands are 1, 2 and 3.
// For the memory variant the folded operand must be operand 3. Thus,
// in that case, only operands 1 and 2 can be swapped.
// Commuting some of the operands may require an opcode change.
// FMA*213*:
//   operands 1 and 2 (memory & register forms): *213* --> *213*(no changes);
//   operands 1 and 3 (register forms only):     *213* --> *231*;
//   operands 2 and 3 (register forms only):     *213* --> *132*.
// FMA*132*:
//   operands 1 and 2 (memory & register forms): *132* --> *231*;
//   operands 1 and 3 (register forms only):     *132* --> *132*(no changes);
//   operands 2 and 3 (register forms only):     *132* --> *213*.
// FMA*231*:
//   operands 1 and 2 (memory & register forms): *231* --> *132*;
//   operands 1 and 3 (register forms only):     *231* --> *213*;
//   operands 2 and 3 (register forms only):     *231* --> *231*(no changes).
350b57cec5SDimitry Andric
// Packed FMA3 instruction pair using the 213 operand order.
//   r - all three sources are registers; carries the selection pattern
//       (Op $src2, $src1, $src3).
//   m - $src3 is a memory operand read through MemFrag; same operand order.
// Parameters:
//   opc       - opcode byte.
//   OpcodeStr - mnemonic; the operand syntax string is appended to it.
//   RC / VT   - register class and value type of the vector operands.
//   x86memop  - memory operand kind used by the 'm' form.
//   MemFrag   - load fragment matched for the folded $src3.
//   Op        - DAG operator matched by both patterns.
//   sched     - scheduling info; the 'm' form uses its Folded/ReadAfterFold
//               members.
multiclass fma3p_rm_213<bits<8> opc, string OpcodeStr, RegisterClass RC,
                        ValueType VT, X86MemOperand x86memop, PatFrag MemFrag,
                        SDPatternOperator Op, X86FoldableSchedWrite sched> {
  def r     : FMA3<opc, MRMSrcReg, (outs RC:$dst),
                   (ins RC:$src1, RC:$src2, RC:$src3),
                   !strconcat(OpcodeStr,
                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                   [(set RC:$dst, (VT (Op RC:$src2, RC:$src1, RC:$src3)))]>,
                   Sched<[sched]>;

  let mayLoad = 1 in
  def m     : FMA3<opc, MRMSrcMem, (outs RC:$dst),
                   (ins RC:$src1, RC:$src2, x86memop:$src3),
                   !strconcat(OpcodeStr,
                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                   [(set RC:$dst, (VT (Op RC:$src2, RC:$src1,
                                          (MemFrag addr:$src3))))]>,
                   Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
}
550b57cec5SDimitry Andric
// Packed FMA3 instruction pair using the 231 operand order.
//   r - register form; declared without a selection pattern, so
//       hasSideEffects is cleared explicitly.
//   m - $src3 is a memory operand read through MemFrag; selected for
//       (Op $src2, (MemFrag $src3), $src1).
// Parameters have the same meaning as in fma3p_rm_213.
multiclass fma3p_rm_231<bits<8> opc, string OpcodeStr, RegisterClass RC,
                        ValueType VT, X86MemOperand x86memop, PatFrag MemFrag,
                        SDPatternOperator Op, X86FoldableSchedWrite sched> {
  let hasSideEffects = 0 in
  def r     : FMA3<opc, MRMSrcReg, (outs RC:$dst),
                   (ins RC:$src1, RC:$src2, RC:$src3),
                   !strconcat(OpcodeStr,
                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                   []>, Sched<[sched]>;

  let mayLoad = 1 in
  def m     : FMA3<opc, MRMSrcMem, (outs RC:$dst),
                   (ins RC:$src1, RC:$src2, x86memop:$src3),
                   !strconcat(OpcodeStr,
                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                   [(set RC:$dst, (VT (Op RC:$src2, (MemFrag addr:$src3),
                                          RC:$src1)))]>,
                   Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
}
750b57cec5SDimitry Andric
// Packed FMA3 instruction pair using the 132 operand order.
//   r - register form; declared without a selection pattern, so
//       hasSideEffects is cleared explicitly.
//   m - $src3 is a memory operand read through MemFrag; selected for
//       (Op (MemFrag $src3), $src1, $src2).
// Parameters have the same meaning as in fma3p_rm_213.
multiclass fma3p_rm_132<bits<8> opc, string OpcodeStr, RegisterClass RC,
                        ValueType VT, X86MemOperand x86memop, PatFrag MemFrag,
                        SDPatternOperator Op, X86FoldableSchedWrite sched> {
  let hasSideEffects = 0 in
  def r     : FMA3<opc, MRMSrcReg, (outs RC:$dst),
                   (ins RC:$src1, RC:$src2, RC:$src3),
                   !strconcat(OpcodeStr,
                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                   []>, Sched<[sched]>;

  // The pattern is written in 312 order so that the load is in a different
  // place from the 213 and 231 patterns; this helps tablegen's duplicate
  // pattern detection.
  let mayLoad = 1 in
  def m     : FMA3<opc, MRMSrcMem, (outs RC:$dst),
                   (ins RC:$src1, RC:$src2, x86memop:$src3),
                   !strconcat(OpcodeStr,
                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                   [(set RC:$dst, (VT (Op (MemFrag addr:$src3), RC:$src1,
                                          RC:$src2)))]>,
                   Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
}
970b57cec5SDimitry Andric
// Instantiates all packed FMA3 forms (213, 231, 132) for one operation, once
// for 128-bit operands and once for 256-bit operands.
//   opc132/opc213/opc231 - opcode byte of each form.
//   OpcodeStr, PackTy    - mnemonic is OpcodeStr + form number + PackTy.
//   Suff                 - suffix pasted into the record names after the
//                          form number; the 256-bit records add a trailing Y.
//   MemFrag128/256       - load fragments for the folded memory operand.
//   Op                   - DAG operator matched by the patterns.
//   OpTy128/256          - vector value types.
//   sched                - per-width scheduling info (XMM and YMM members).
// Every generated instruction ties $src1 to $dst, is marked commutable and
// free of unmodelled side effects, reads MXCSR and may raise an FP exception.
let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1,
    Uses = [MXCSR], mayRaiseFPException = 1 in
multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
                       string OpcodeStr, string PackTy, string Suff,
                       PatFrag MemFrag128, PatFrag MemFrag256,
                       SDPatternOperator Op, ValueType OpTy128, ValueType OpTy256,
                       X86SchedWriteWidths sched> {
  // 128-bit forms.
  defm NAME#213#Suff : fma3p_rm_213<opc213, !strconcat(OpcodeStr, "213", PackTy),
                                    VR128, OpTy128, f128mem, MemFrag128, Op, sched.XMM>;
  defm NAME#231#Suff : fma3p_rm_231<opc231, !strconcat(OpcodeStr, "231", PackTy),
                                    VR128, OpTy128, f128mem, MemFrag128, Op, sched.XMM>;
  defm NAME#132#Suff : fma3p_rm_132<opc132, !strconcat(OpcodeStr, "132", PackTy),
                                    VR128, OpTy128, f128mem, MemFrag128, Op, sched.XMM>;

  // 256-bit forms; these additionally carry VEX_L.
  defm NAME#213#Suff#Y : fma3p_rm_213<opc213, !strconcat(OpcodeStr, "213", PackTy),
                                      VR256, OpTy256, f256mem, MemFrag256, Op, sched.YMM>,
                                      VEX_L;
  defm NAME#231#Suff#Y : fma3p_rm_231<opc231, !strconcat(OpcodeStr, "231", PackTy),
                                      VR256, OpTy256, f256mem, MemFrag256, Op, sched.YMM>,
                                      VEX_L;
  defm NAME#132#Suff#Y : fma3p_rm_132<opc132, !strconcat(OpcodeStr, "132", PackTy),
                                      VR256, OpTy256, f256mem, MemFrag256, Op, sched.YMM>,
                                      VEX_L;
}
1220b57cec5SDimitry Andric
// Fused Multiply-Add
// Packed single-precision FMADD/FMSUB/FMADDSUB/FMSUBADD (v4f32 and v8f32).
let ExeDomain = SSEPackedSingle in {
  defm VFMADD    : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", "PS",
                               loadv4f32, loadv8f32, any_fma, v4f32, v8f32,
                               SchedWriteFMA>;
  defm VFMSUB    : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", "PS",
                               loadv4f32, loadv8f32, X86any_Fmsub, v4f32, v8f32,
                               SchedWriteFMA>;
  defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps", "PS",
                               loadv4f32, loadv8f32, X86Fmaddsub, v4f32, v8f32,
                               SchedWriteFMA>;
  defm VFMSUBADD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps", "PS",
                               loadv4f32, loadv8f32, X86Fmsubadd, v4f32, v8f32,
                               SchedWriteFMA>;
}
1380b57cec5SDimitry Andric
// Packed double-precision FMADD/FMSUB/FMADDSUB/FMSUBADD (v2f64 and v4f64).
// Same opcode bytes as the single-precision group; each defm adds REX_W.
let ExeDomain = SSEPackedDouble in {
  defm VFMADD    : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", "PD",
                               loadv2f64, loadv4f64, any_fma, v2f64,
                               v4f64, SchedWriteFMA>, REX_W;
  defm VFMSUB    : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", "PD",
                               loadv2f64, loadv4f64, X86any_Fmsub, v2f64,
                               v4f64, SchedWriteFMA>, REX_W;
  defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd", "PD",
                               loadv2f64, loadv4f64, X86Fmaddsub,
                               v2f64, v4f64, SchedWriteFMA>, REX_W;
  defm VFMSUBADD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd", "PD",
                               loadv2f64, loadv4f64, X86Fmsubadd,
                               v2f64, v4f64, SchedWriteFMA>, REX_W;
}
1530b57cec5SDimitry Andric
// Fused Negative Multiply-Add
// Packed single-precision FNMADD/FNMSUB (v4f32 and v8f32).
let ExeDomain = SSEPackedSingle in {
  defm VFNMADD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", "PS", loadv4f32,
                             loadv8f32, X86any_Fnmadd, v4f32, v8f32,
                             SchedWriteFMA>;
  defm VFNMSUB : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", "PS", loadv4f32,
                             loadv8f32, X86any_Fnmsub, v4f32, v8f32,
                             SchedWriteFMA>;
}
// Packed double-precision FNMADD/FNMSUB (v2f64 and v4f64); each defm adds
// REX_W.
let ExeDomain = SSEPackedDouble in {
  defm VFNMADD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", "PD", loadv2f64,
                             loadv4f64, X86any_Fnmadd, v2f64, v4f64,
                             SchedWriteFMA>, REX_W;
  defm VFNMSUB : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd", "PD", loadv2f64,
                             loadv4f64, X86any_Fnmsub, v2f64, v4f64,
                             SchedWriteFMA>, REX_W;
}
1670b57cec5SDimitry Andric
// All source register operands of the FMA opcodes defined in the fma3s_rm_*
// multiclasses can be commuted. In many cases such a commute transformation
// requires an opcode adjustment; for example, commuting operands 1 and 2 in
// the FMA*132 form requires an opcode change to FMA*231:
//     FMA*132* reg1, reg2, reg3; // reg1 * reg3 + reg2;
//     -->
//     FMA*231* reg2, reg1, reg3; // reg1 * reg3 + reg2;
// Please see the more detailed comment at the very beginning of the section
// defining the FMA3 opcodes above.
//
// Scalar FMA3 instruction pair using the 213 operand order.
//   r - register form; selected for (OpNode $src2, $src1, $src3).
//   m - $src3 is a memory operand matched as a plain load; same operand order.
multiclass fma3s_rm_213<bits<8> opc, string OpcodeStr,
                        X86MemOperand x86memop, RegisterClass RC,
                        SDPatternOperator OpNode,
                        X86FoldableSchedWrite sched> {
  def r : FMA3S<opc, MRMSrcReg, (outs RC:$dst),
                (ins RC:$src1, RC:$src2, RC:$src3),
                !strconcat(OpcodeStr,
                           "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                [(set RC:$dst, (OpNode RC:$src2, RC:$src1, RC:$src3))]>,
                Sched<[sched]>;

  let mayLoad = 1 in
  def m : FMA3S<opc, MRMSrcMem, (outs RC:$dst),
                (ins RC:$src1, RC:$src2, x86memop:$src3),
                !strconcat(OpcodeStr,
                           "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                [(set RC:$dst,
                  (OpNode RC:$src2, RC:$src1, (load addr:$src3)))]>,
                Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
}
1970b57cec5SDimitry Andric
// Scalar FMA3 instruction pair using the 231 operand order.
//   r - register form; declared without a selection pattern, so
//       hasSideEffects is cleared explicitly.
//   m - $src3 is a memory operand matched as a plain load; selected for
//       (OpNode $src2, (load $src3), $src1).
multiclass fma3s_rm_231<bits<8> opc, string OpcodeStr,
                        X86MemOperand x86memop, RegisterClass RC,
                        SDPatternOperator OpNode, X86FoldableSchedWrite sched> {
  let hasSideEffects = 0 in
  def r : FMA3S<opc, MRMSrcReg, (outs RC:$dst),
                (ins RC:$src1, RC:$src2, RC:$src3),
                !strconcat(OpcodeStr,
                           "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                []>, Sched<[sched]>;

  let mayLoad = 1 in
  def m : FMA3S<opc, MRMSrcMem, (outs RC:$dst),
                (ins RC:$src1, RC:$src2, x86memop:$src3),
                !strconcat(OpcodeStr,
                           "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                [(set RC:$dst,
                  (OpNode RC:$src2, (load addr:$src3), RC:$src1))]>,
                Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
}
2170b57cec5SDimitry Andric
// Scalar FMA3 instruction pair using the 132 operand order.
//   r - register form; declared without a selection pattern, so
//       hasSideEffects is cleared explicitly.
//   m - $src3 is a memory operand matched as a plain load; selected for
//       (OpNode (load $src3), $src1, $src2).
multiclass fma3s_rm_132<bits<8> opc, string OpcodeStr,
                        X86MemOperand x86memop, RegisterClass RC,
                        SDPatternOperator OpNode, X86FoldableSchedWrite sched> {
  let hasSideEffects = 0 in
  def r : FMA3S<opc, MRMSrcReg, (outs RC:$dst),
                (ins RC:$src1, RC:$src2, RC:$src3),
                !strconcat(OpcodeStr,
                           "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                []>, Sched<[sched]>;

  // The pattern is written in 312 order so that the load is in a different
  // place from the 213 and 231 patterns; this helps tablegen's duplicate
  // pattern detection.
  let mayLoad = 1 in
  def m : FMA3S<opc, MRMSrcMem, (outs RC:$dst),
                (ins RC:$src1, RC:$src2, x86memop:$src3),
                !strconcat(OpcodeStr,
                           "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                [(set RC:$dst,
                  (OpNode (load addr:$src3), RC:$src1, RC:$src2))]>,
                Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
}
2390b57cec5SDimitry Andric
// Instantiates the three scalar FMA3 forms (213, 231, 132) for one operation
// and one element type.
//   opc132/opc213/opc231 - opcode byte of each form.
//   OpStr, PackTy        - mnemonic is OpStr + form number + PackTy.
//   Suff                 - suffix pasted into the record names after the
//                          form number.
//   OpNode               - DAG operator matched by the patterns.
//   RC, x86memop         - scalar register class and memory operand kind.
//   sched                - scheduling info.
// Every generated instruction ties $src1 to $dst, is commutable, is marked
// isCodeGenOnly, has no unmodelled side effects, reads MXCSR and may raise an
// FP exception.
let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1,
    hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in
multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
                       string OpStr, string PackTy, string Suff,
                       SDPatternOperator OpNode, RegisterClass RC,
                       X86MemOperand x86memop, X86FoldableSchedWrite sched> {
  defm NAME#213#Suff : fma3s_rm_213<opc213, !strconcat(OpStr, "213", PackTy),
                                    x86memop, RC, OpNode, sched>;
  defm NAME#231#Suff : fma3s_rm_231<opc231, !strconcat(OpStr, "231", PackTy),
                                    x86memop, RC, OpNode, sched>;
  defm NAME#132#Suff : fma3s_rm_132<opc132, !strconcat(OpStr, "132", PackTy),
                                    x86memop, RC, OpNode, sched>;
}
2530b57cec5SDimitry Andric
// These FMA*_Int instructions are defined specially for use when the scalar
// FMA intrinsics are lowered to machine instructions, and in that sense they
// are similar to the existing ADD*_Int, SUB*_Int, MUL*_Int, etc. instructions.
//
// All of the FMA*_Int opcodes are defined as commutable here.
// Commuting the 2nd and 3rd source register operands of FMAs is quite trivial
// and the corresponding optimizations have been developed.
// Commuting the 1st operand of FMA*_Int requires some additional analysis:
// the commute optimization is legal only if all users of FMA*_Int use only
// the lowest element of the FMA*_Int instruction. Even though such analysis
// may not be implemented yet, we allow the routines doing the actual commute
// transformation to decide whether a given instruction is commutable or not.
//
// Both forms are declared without selection patterns:
//   r_Int - all three sources are registers.
//   m_Int - $src3 is a memory operand of kind memopr.
let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0,
    Uses = [MXCSR], mayRaiseFPException = 1 in
multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
                        Operand memopr, RegisterClass RC,
                        X86FoldableSchedWrite sched> {
  def r_Int : FMA3S_Int<opc, MRMSrcReg, (outs RC:$dst),
                        (ins RC:$src1, RC:$src2, RC:$src3),
                        !strconcat(OpcodeStr,
                                   "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                        []>, Sched<[sched]>;

  let mayLoad = 1 in
  def m_Int : FMA3S_Int<opc, MRMSrcMem, (outs RC:$dst),
                        (ins RC:$src1, RC:$src2, memopr:$src3),
                        !strconcat(OpcodeStr,
                                   "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                        []>, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
}
2850b57cec5SDimitry Andric
// The FMA 213 form is created for lowering of scalar FMA intrinsics
// to machine instructions.
// The FMA 132 form can be obtained trivially by commuting the 2nd and 3rd
// operands of the FMA 213 form.
// The FMA 231 form can be obtained only by commuting the 1st operand of the
// 213 or 132 forms, which is possible only after special analysis of all uses
// of the initial instruction. Such analysis does not exist yet, and thus the
// 231 form of the FMA*_Int instructions is introduced on the optimistic
// assumption that such analysis will be implemented eventually.
//
// Instantiates the 132, 213 and 231 *_Int forms for one operation and one
// element type; the mnemonic is OpStr + form number + PackTy.
multiclass fma3s_int_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
                           string OpStr, string PackTy, string Suff,
                           RegisterClass RC, Operand memop,
                           X86FoldableSchedWrite sched> {
  defm NAME#132#Suff : fma3s_rm_int<opc132, !strconcat(OpStr, "132", PackTy),
                                    memop, RC, sched>;
  defm NAME#213#Suff : fma3s_rm_int<opc213, !strconcat(OpStr, "213", PackTy),
                                    memop, RC, sched>;
  defm NAME#231#Suff : fma3s_rm_int<opc231, !strconcat(OpStr, "231", PackTy),
                                    memop, RC, sched>;
}
3060b57cec5SDimitry Andric
// Instantiates every scalar FMA3 instruction for one operation: the plain
// forms (fma3s_forms) and the *_Int forms (fma3s_int_forms), for both "ss"
// and "sd".
//   opc132/opc213/opc231 - opcode byte of each form.
//   OpStr                - mnemonic prefix, e.g. "vfmadd".
//   OpNode               - DAG operator matched by the plain forms.
//   sched                - scheduling info shared by all generated records.
multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
                 string OpStr, SDPatternOperator OpNode, X86FoldableSchedWrite sched> {
  // Single precision: FR32/f32mem plain forms, VR128/ssmem *_Int forms.
  let ExeDomain = SSEPackedSingle in
  defm NAME : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", "SS", OpNode,
                          FR32, f32mem, sched>,
              fma3s_int_forms<opc132, opc213, opc231, OpStr, "ss", "SS",
                              VR128, ssmem, sched>;

  // Double precision: FR64/f64mem plain forms, VR128/sdmem *_Int forms; the
  // defm additionally carries REX_W.
  let ExeDomain = SSEPackedDouble in
  defm NAME : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", "SD", OpNode,
                          FR64, f64mem, sched>,
              fma3s_int_forms<opc132, opc213, opc231, OpStr, "sd", "SD",
                              VR128, sdmem, sched>, REX_W;
}
3210b57cec5SDimitry Andric
// Scalar FMA3 instructions; every defm carries VEX_LIG and uses the scalar
// (Scl) member of SchedWriteFMA.
defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", any_fma,
                    SchedWriteFMA.Scl>, VEX_LIG;
defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", X86any_Fmsub,
                    SchedWriteFMA.Scl>, VEX_LIG;

defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", X86any_Fnmadd,
                     SchedWriteFMA.Scl>, VEX_LIG;
defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", X86any_Fnmsub,
                     SchedWriteFMA.Scl>, VEX_LIG;
3310b57cec5SDimitry Andric
// Selection patterns mapping "scalar FMA on element 0 of $src1, merged back
// into $src1 via Move" onto the FMA*_Int instructions.
//   Op             - scalar DAG operator being matched.
//   Prefix, Suffix - instruction record name is
//                    Prefix + form number + Suffix + "r_Int" / "m_Int".
//   Move           - node that merges the scalar result into $src1.
//   VT, EltVT      - vector type and its element type.
//   RC             - scalar register class of $src2 / $src3.
//   mem_frag       - load fragment for the memory forms.
// In every pattern one operand of Op is element 0 extracted from $src1; its
// position among Op's operands determines which form is selected. Scalar
// register operands are copied to VR128 for the *_Int instruction.
// The patterns apply only under the HasFMA and NoAVX512 predicates.
multiclass scalar_fma_patterns<SDPatternOperator Op, string Prefix, string Suffix,
                               SDNode Move, ValueType VT, ValueType EltVT,
                               RegisterClass RC, PatFrag mem_frag> {
  let Predicates = [HasFMA, NoAVX512] in {
    // Op($src2, $src1[0], $src3), all registers -> 213 register form.
    def : Pat<(VT (Move (VT VR128:$src1), (VT (scalar_to_vector
                (Op RC:$src2,
                    (EltVT (extractelt (VT VR128:$src1), (iPTR 0))),
                    RC:$src3))))),
              (!cast<Instruction>(Prefix#"213"#Suffix#"r_Int")
               VR128:$src1, (VT (COPY_TO_REGCLASS RC:$src2, VR128)),
               (VT (COPY_TO_REGCLASS RC:$src3, VR128)))>;

    // Op($src2, $src3, $src1[0]), all registers -> 231 register form.
    def : Pat<(VT (Move (VT VR128:$src1), (VT (scalar_to_vector
                (Op RC:$src2, RC:$src3,
                    (EltVT (extractelt (VT VR128:$src1), (iPTR 0)))))))),
              (!cast<Instruction>(Prefix#"231"#Suffix#"r_Int")
               VR128:$src1, (VT (COPY_TO_REGCLASS RC:$src2, VR128)),
               (VT (COPY_TO_REGCLASS RC:$src3, VR128)))>;

    // Op($src2, $src1[0], load $src3) -> 213 memory form.
    def : Pat<(VT (Move (VT VR128:$src1), (VT (scalar_to_vector
                (Op RC:$src2,
                    (EltVT (extractelt (VT VR128:$src1), (iPTR 0))),
                    (mem_frag addr:$src3)))))),
              (!cast<Instruction>(Prefix#"213"#Suffix#"m_Int")
               VR128:$src1, (VT (COPY_TO_REGCLASS RC:$src2, VR128)),
               addr:$src3)>;

    // Op($src1[0], load $src3, $src2) -> 132 memory form.
    def : Pat<(VT (Move (VT VR128:$src1), (VT (scalar_to_vector
                (Op (EltVT (extractelt (VT VR128:$src1), (iPTR 0))),
                    (mem_frag addr:$src3), RC:$src2))))),
              (!cast<Instruction>(Prefix#"132"#Suffix#"m_Int")
               VR128:$src1, (VT (COPY_TO_REGCLASS RC:$src2, VR128)),
               addr:$src3)>;

    // Op($src2, load $src3, $src1[0]) -> 231 memory form.
    def : Pat<(VT (Move (VT VR128:$src1), (VT (scalar_to_vector
                (Op RC:$src2, (mem_frag addr:$src3),
                    (EltVT (extractelt (VT VR128:$src1), (iPTR 0)))))))),
              (!cast<Instruction>(Prefix#"231"#Suffix#"m_Int")
               VR128:$src1, (VT (COPY_TO_REGCLASS RC:$src2, VR128)),
               addr:$src3)>;
  }
}
3740b57cec5SDimitry Andric
// Scalar *_Int selection patterns, single precision (X86Movss, v4f32/f32).
defm : scalar_fma_patterns<any_fma, "VFMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
defm : scalar_fma_patterns<X86any_Fmsub, "VFMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
defm : scalar_fma_patterns<X86any_Fnmadd, "VFNMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
defm : scalar_fma_patterns<X86any_Fnmsub, "VFNMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>;

// Scalar *_Int selection patterns, double precision (X86Movsd, v2f64/f64).
defm : scalar_fma_patterns<any_fma, "VFMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
defm : scalar_fma_patterns<X86any_Fmsub, "VFMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
defm : scalar_fma_patterns<X86any_Fnmadd, "VFNMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
defm : scalar_fma_patterns<X86any_Fnmsub, "VFNMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
3840b57cec5SDimitry Andric
//===----------------------------------------------------------------------===//
// FMA4 - AMD 4 operand Fused Multiply-Add instructions
//===----------------------------------------------------------------------===//
3880b57cec5SDimitry Andric
// Scalar FMA4 instructions for one operation. Unlike FMA3, $dst is a separate
// operand and is not tied to a source.
//   rr     - three register sources; commutable; selected for
//            (OpNode $src1, $src2, $src3).
//   rm     - $src3 is a memory operand loaded through mem_frag.
//   mr     - $src2 is a memory operand loaded through mem_frag.
//   rr_REV - register form using MRMSrcReg instead of MRMSrcRegOp4; no
//            pattern, codegen-only, kept for the disassembler.
// rr and rm carry REX_W while mr and rr_REV do not.
// NOTE(review): presumably the W bit is what distinguishes the two operand
// placements in the encoding - confirm against the FMA4S class definition.
// All four read MXCSR and may raise an FP exception.
let Uses = [MXCSR], mayRaiseFPException = 1 in
multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                 X86MemOperand x86memop, ValueType OpVT, SDPatternOperator OpNode,
                 PatFrag mem_frag, X86FoldableSchedWrite sched> {
  let isCommutable = 1 in
  def rr : FMA4S<opc, MRMSrcRegOp4, (outs RC:$dst),
           (ins RC:$src1, RC:$src2, RC:$src3),
           !strconcat(OpcodeStr,
           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
           [(set RC:$dst,
             (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, REX_W, VEX_LIG,
           Sched<[sched]>;
  def rm : FMA4S<opc, MRMSrcMemOp4, (outs RC:$dst),
           (ins RC:$src1, RC:$src2, x86memop:$src3),
           !strconcat(OpcodeStr,
           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
           [(set RC:$dst, (OpNode RC:$src1, RC:$src2,
                           (mem_frag addr:$src3)))]>, REX_W, VEX_LIG,
           Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
  def mr : FMA4S<opc, MRMSrcMem, (outs RC:$dst),
           (ins RC:$src1, x86memop:$src2, RC:$src3),
           !strconcat(OpcodeStr,
           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
           [(set RC:$dst,
             (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>, VEX_LIG,
           Sched<[sched.Folded, sched.ReadAfterFold,
                  // x86memop:$src2
                  ReadDefault, ReadDefault, ReadDefault, ReadDefault,
                  ReadDefault,
                  // RC:$src3
                  sched.ReadAfterFold]>;
  // For disassembler
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : FMA4S<opc, MRMSrcReg, (outs RC:$dst),
               (ins RC:$src1, RC:$src2, RC:$src3),
               !strconcat(OpcodeStr,
               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
               VEX_LIG, Sched<[sched]>;
}
4280b57cec5SDimitry Andric
// fma4s_int - VR128 ("_Int") variants of the scalar FMA4 instructions.
//
// Emits rr_Int, rm_Int, mr_Int and rr_Int_REV using the same encoding forms
// as the rr/rm/mr/rr_REV records of fma4s, but with VR128 register operands
// and `memop` as the memory operand type. Every pattern list is empty; the
// selection patterns that target these records are provided separately by
// scalar_fma4_patterns.
//
// All four records are isCodeGenOnly with hasSideEffects = 0, use MXCSR and
// set mayRaiseFPException. mayLoad is set explicitly on the two memory forms.
multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
                     X86FoldableSchedWrite sched> {
let isCodeGenOnly = 1, hasSideEffects = 0,
    Uses = [MXCSR], mayRaiseFPException = 1 in {
  def rr_Int : FMA4S_Int<opc, MRMSrcRegOp4, (outs VR128:$dst),
               (ins VR128:$src1, VR128:$src2, VR128:$src3),
               !strconcat(OpcodeStr,
               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
               []>, REX_W, VEX_LIG, Sched<[sched]>;
  let mayLoad = 1 in
  def rm_Int : FMA4S_Int<opc, MRMSrcMemOp4, (outs VR128:$dst),
               (ins VR128:$src1, VR128:$src2, memop:$src3),
               !strconcat(OpcodeStr,
               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
               []>, REX_W, VEX_LIG,
               Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
  let mayLoad = 1 in
  def mr_Int : FMA4S_Int<opc, MRMSrcMem, (outs VR128:$dst),
               (ins VR128:$src1, memop:$src2, VR128:$src3),
               !strconcat(OpcodeStr,
               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
               []>,
               VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold,
                               // memop:$src2
                               ReadDefault, ReadDefault, ReadDefault,
                               ReadDefault, ReadDefault,
                               // VR128:$src3
                               sched.ReadAfterFold]>;
  // Register form with the MRMSrcReg encoding and no REX_W.
  def rr_Int_REV : FMA4S_Int<opc, MRMSrcReg, (outs VR128:$dst),
               (ins VR128:$src1, VR128:$src2, VR128:$src3),
               !strconcat(OpcodeStr,
               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
               []>, VEX_LIG, Sched<[sched]>;
} // isCodeGenOnly = 1, hasSideEffects = 0, Uses = [MXCSR],
  // mayRaiseFPException = 1
}
4640b57cec5SDimitry Andric
// fma4p - Packed FMA4 instruction forms for 128-bit and 256-bit vectors.
//
// For opcode `opc` and mnemonic `OpcodeStr` this multiclass emits:
//   rr  / Yrr  - all sources in registers (MRMSrcRegOp4, REX_W); commutable.
//   rm  / Yrm  - $src3 loaded from memory (MRMSrcMemOp4, REX_W).
//   mr  / Ymr  - $src2 loaded from memory (MRMSrcMem, no REX_W).
//   rr_REV / Yrr_REV - register forms encoded as MRMSrcReg without REX_W and
//                      without patterns; isCodeGenOnly/ForceDisassemble.
// The un-prefixed records use VR128/f128mem; the "Y" records use
// VR256/f256mem and additionally carry VEX_L.
//
// Parameters:
//   OpNode                 - three-operand DAG operator being selected.
//   OpVT128 / OpVT256      - value types applied to the rr / Yrr patterns.
//   ld_frag128 / ld_frag256 - load fragments matched by the memory forms.
//   sched                  - scheduling classes; .XMM and .YMM are used for
//                            the 128-bit and 256-bit records respectively.
//
// The enclosing `let` makes every record use MXCSR and sets
// mayRaiseFPException.
let Uses = [MXCSR], mayRaiseFPException = 1 in
multiclass fma4p<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
                 ValueType OpVT128, ValueType OpVT256,
                 PatFrag ld_frag128, PatFrag ld_frag256,
                 X86SchedWriteWidths sched> {
  let isCommutable = 1 in
  def rr : FMA4<opc, MRMSrcRegOp4, (outs VR128:$dst),
           (ins VR128:$src1, VR128:$src2, VR128:$src3),
           !strconcat(OpcodeStr,
           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
           [(set VR128:$dst,
             (OpVT128 (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>,
           REX_W, Sched<[sched.XMM]>;
  def rm : FMA4<opc, MRMSrcMemOp4, (outs VR128:$dst),
           (ins VR128:$src1, VR128:$src2, f128mem:$src3),
           !strconcat(OpcodeStr,
           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
           [(set VR128:$dst, (OpNode VR128:$src1, VR128:$src2,
                              (ld_frag128 addr:$src3)))]>, REX_W,
           Sched<[sched.XMM.Folded, sched.XMM.ReadAfterFold,
                  sched.XMM.ReadAfterFold]>;
  def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
           (ins VR128:$src1, f128mem:$src2, VR128:$src3),
           !strconcat(OpcodeStr,
           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
           [(set VR128:$dst,
             (OpNode VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>,
           Sched<[sched.XMM.Folded, sched.XMM.ReadAfterFold,
                  // f128mem:$src2
                  ReadDefault, ReadDefault, ReadDefault, ReadDefault,
                  ReadDefault,
                  // VR128:$src3
                  sched.XMM.ReadAfterFold]>;
  let isCommutable = 1 in
  def Yrr : FMA4<opc, MRMSrcRegOp4, (outs VR256:$dst),
           (ins VR256:$src1, VR256:$src2, VR256:$src3),
           !strconcat(OpcodeStr,
           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
           [(set VR256:$dst,
             (OpVT256 (OpNode VR256:$src1, VR256:$src2, VR256:$src3)))]>,
           REX_W, VEX_L, Sched<[sched.YMM]>;
  def Yrm : FMA4<opc, MRMSrcMemOp4, (outs VR256:$dst),
           (ins VR256:$src1, VR256:$src2, f256mem:$src3),
           !strconcat(OpcodeStr,
           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
           [(set VR256:$dst, (OpNode VR256:$src1, VR256:$src2,
                              (ld_frag256 addr:$src3)))]>, REX_W, VEX_L,
           Sched<[sched.YMM.Folded, sched.YMM.ReadAfterFold,
                  sched.YMM.ReadAfterFold]>;
  def Ymr : FMA4<opc, MRMSrcMem, (outs VR256:$dst),
           (ins VR256:$src1, f256mem:$src2, VR256:$src3),
           !strconcat(OpcodeStr,
           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
           [(set VR256:$dst, (OpNode VR256:$src1,
                              (ld_frag256 addr:$src2), VR256:$src3))]>, VEX_L,
           Sched<[sched.YMM.Folded, sched.YMM.ReadAfterFold,
                  // f256mem:$src2
                  ReadDefault, ReadDefault, ReadDefault, ReadDefault,
                  ReadDefault,
                  // VR256:$src3
                  sched.YMM.ReadAfterFold]>;
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
  def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
               (ins VR128:$src1, VR128:$src2, VR128:$src3),
               !strconcat(OpcodeStr,
               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
               Sched<[sched.XMM]>;
  def Yrr_REV : FMA4<opc, MRMSrcReg, (outs VR256:$dst),
                (ins VR256:$src1, VR256:$src2, VR256:$src3),
                !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
                VEX_L, Sched<[sched.YMM]>;
} // isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0
}
5380b57cec5SDimitry Andric
// FMA4 instruction definitions in the SSEPackedSingle execution domain.
// Each scalar opcode instantiates both fma4s (FR32/f32mem forms) and
// fma4s_int (VR128/ssmem forms) under the same record-name prefix; each packed
// opcode instantiates fma4p with the v4f32/v8f32 value types.
let ExeDomain = SSEPackedSingle in {
  // Scalar Instructions
  defm VFMADDSS4  : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, any_fma, loadf32,
                          SchedWriteFMA.Scl>,
                    fma4s_int<0x6A, "vfmaddss", ssmem, SchedWriteFMA.Scl>;
  defm VFMSUBSS4  : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32,
                          X86any_Fmsub, loadf32, SchedWriteFMA.Scl>,
                    fma4s_int<0x6E, "vfmsubss", ssmem, SchedWriteFMA.Scl>;
  defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32,
                          X86any_Fnmadd, loadf32, SchedWriteFMA.Scl>,
                    fma4s_int<0x7A, "vfnmaddss", ssmem, SchedWriteFMA.Scl>;
  defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32,
                          X86any_Fnmsub, loadf32, SchedWriteFMA.Scl>,
                    fma4s_int<0x7E, "vfnmsubss", ssmem, SchedWriteFMA.Scl>;
  // Packed Instructions
  defm VFMADDPS4    : fma4p<0x68, "vfmaddps", any_fma, v4f32, v8f32,
                            loadv4f32, loadv8f32, SchedWriteFMA>;
  defm VFMSUBPS4    : fma4p<0x6C, "vfmsubps", X86any_Fmsub, v4f32, v8f32,
                            loadv4f32, loadv8f32, SchedWriteFMA>;
  defm VFNMADDPS4   : fma4p<0x78, "vfnmaddps", X86any_Fnmadd, v4f32, v8f32,
                            loadv4f32, loadv8f32, SchedWriteFMA>;
  defm VFNMSUBPS4   : fma4p<0x7C, "vfnmsubps", X86any_Fnmsub, v4f32, v8f32,
                            loadv4f32, loadv8f32, SchedWriteFMA>;
  defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32,
                            loadv4f32, loadv8f32, SchedWriteFMA>;
  defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", X86Fmsubadd, v4f32, v8f32,
                            loadv4f32, loadv8f32, SchedWriteFMA>;
}
5670b57cec5SDimitry Andric
// FMA4 instruction definitions in the SSEPackedDouble execution domain.
// Each scalar opcode instantiates both fma4s (FR64/f64mem forms) and
// fma4s_int (VR128/sdmem forms) under the same record-name prefix; each packed
// opcode instantiates fma4p with the v2f64/v4f64 value types.
let ExeDomain = SSEPackedDouble in {
  // Scalar Instructions
  defm VFMADDSD4  : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, any_fma, loadf64,
                          SchedWriteFMA.Scl>,
                    fma4s_int<0x6B, "vfmaddsd", sdmem, SchedWriteFMA.Scl>;
  defm VFMSUBSD4  : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64,
                          X86any_Fmsub, loadf64, SchedWriteFMA.Scl>,
                    fma4s_int<0x6F, "vfmsubsd", sdmem, SchedWriteFMA.Scl>;
  defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64,
                          X86any_Fnmadd, loadf64, SchedWriteFMA.Scl>,
                    fma4s_int<0x7B, "vfnmaddsd", sdmem, SchedWriteFMA.Scl>;
  defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64,
                          X86any_Fnmsub, loadf64, SchedWriteFMA.Scl>,
                    fma4s_int<0x7F, "vfnmsubsd", sdmem, SchedWriteFMA.Scl>;
  // Packed Instructions
  defm VFMADDPD4    : fma4p<0x69, "vfmaddpd", any_fma, v2f64, v4f64,
                            loadv2f64, loadv4f64, SchedWriteFMA>;
  defm VFMSUBPD4    : fma4p<0x6D, "vfmsubpd", X86any_Fmsub, v2f64, v4f64,
                            loadv2f64, loadv4f64, SchedWriteFMA>;
  defm VFNMADDPD4   : fma4p<0x79, "vfnmaddpd", X86any_Fnmadd, v2f64, v4f64,
                            loadv2f64, loadv4f64, SchedWriteFMA>;
  defm VFNMSUBPD4   : fma4p<0x7D, "vfnmsubpd", X86any_Fnmsub, v2f64, v4f64,
                            loadv2f64, loadv4f64, SchedWriteFMA>;
  defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64,
                            loadv2f64, loadv4f64, SchedWriteFMA>;
  defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", X86Fmsubadd, v2f64, v4f64,
                            loadv2f64, loadv4f64, SchedWriteFMA>;
}
5960b57cec5SDimitry Andric
// scalar_fma4_patterns - Selection patterns that map a scalar FMA node whose
// result is wrapped as (X86vzmovl (scalar_to_vector ...)) onto the FMA4 "_Int"
// instructions.
//
// Parameters:
//   Op       - three-operand scalar DAG operator to match.
//   Name     - instruction record prefix; "rr_Int", "rm_Int" or "mr_Int" is
//              appended and the result is cast to an Instruction.
//   VT       - vector value type of the matched result and of the VR128
//              operands handed to the instruction.
//   RC       - scalar register class of the matched register operands.
//   mem_frag - load fragment matched for a memory operand.
//
// Three patterns are defined, all predicated on HasFMA4:
//   1. every operand in a register        -> Name#"rr_Int"
//   2. third operand loaded via mem_frag  -> Name#"rm_Int"
//   3. second operand loaded via mem_frag -> Name#"mr_Int"
// Register operands are moved from RC into VR128 with COPY_TO_REGCLASS.
multiclass scalar_fma4_patterns<SDPatternOperator Op, string Name,
                                ValueType VT, RegisterClass RC,
                                PatFrag mem_frag> {
  let Predicates = [HasFMA4] in {
    def : Pat<(VT (X86vzmovl (VT (scalar_to_vector
                                  (Op RC:$src1, RC:$src2, RC:$src3))))),
              (!cast<Instruction>(Name#"rr_Int")
               (VT (COPY_TO_REGCLASS RC:$src1, VR128)),
               (VT (COPY_TO_REGCLASS RC:$src2, VR128)),
               (VT (COPY_TO_REGCLASS RC:$src3, VR128)))>;

    def : Pat<(VT (X86vzmovl (VT (scalar_to_vector
                                  (Op RC:$src1, RC:$src2,
                                      (mem_frag addr:$src3)))))),
              (!cast<Instruction>(Name#"rm_Int")
               (VT (COPY_TO_REGCLASS RC:$src1, VR128)),
               (VT (COPY_TO_REGCLASS RC:$src2, VR128)), addr:$src3)>;

    def : Pat<(VT (X86vzmovl (VT (scalar_to_vector
                                  (Op RC:$src1, (mem_frag addr:$src2),
                                      RC:$src3))))),
              (!cast<Instruction>(Name#"mr_Int")
               (VT (COPY_TO_REGCLASS RC:$src1, VR128)), addr:$src2,
               (VT (COPY_TO_REGCLASS RC:$src3, VR128)))>;
  }
}
6230b57cec5SDimitry Andric
// Instantiate scalar_fma4_patterns for each scalar FMA4 opcode: the "SS4"
// records with v4f32/FR32/loadf32, then the "SD4" records with
// v2f64/FR64/loadf64.
defm : scalar_fma4_patterns<any_fma, "VFMADDSS4", v4f32, FR32, loadf32>;
defm : scalar_fma4_patterns<X86any_Fmsub, "VFMSUBSS4", v4f32, FR32, loadf32>;
defm : scalar_fma4_patterns<X86any_Fnmadd, "VFNMADDSS4", v4f32, FR32, loadf32>;
defm : scalar_fma4_patterns<X86any_Fnmsub, "VFNMSUBSS4", v4f32, FR32, loadf32>;

defm : scalar_fma4_patterns<any_fma, "VFMADDSD4", v2f64, FR64, loadf64>;
defm : scalar_fma4_patterns<X86any_Fmsub, "VFMSUBSD4", v2f64, FR64, loadf64>;
defm : scalar_fma4_patterns<X86any_Fnmadd, "VFNMADDSD4", v2f64, FR64, loadf64>;
defm : scalar_fma4_patterns<X86any_Fnmsub, "VFNMSUBSD4", v2f64, FR64, loadf64>;
633