106f32e7eSjoerg//=- X86ScheduleBtVer2.td - X86 BtVer2 (Jaguar) Scheduling ---*- tablegen -*-=//
206f32e7eSjoerg//
306f32e7eSjoerg// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
406f32e7eSjoerg// See https://llvm.org/LICENSE.txt for license information.
506f32e7eSjoerg// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
606f32e7eSjoerg//
706f32e7eSjoerg//===----------------------------------------------------------------------===//
806f32e7eSjoerg//
906f32e7eSjoerg// This file defines the machine model for AMD btver2 (Jaguar) to support
1006f32e7eSjoerg// instruction scheduling and other instruction cost heuristics. Based off AMD Software
1106f32e7eSjoerg// Optimization Guide for AMD Family 16h Processors & Instruction Latency appendix.
1206f32e7eSjoerg//
1306f32e7eSjoerg//===----------------------------------------------------------------------===//
1406f32e7eSjoerg
// Machine-model record for AMD btver2 (Jaguar).
def BtVer2Model : SchedMachineModel {
  // Every x86 instruction is modeled as one micro-op; btver2 decodes two
  // instructions per cycle, hence the issue width of 2.
  let IssueWidth = 2;

  // Sized after the Retire Control Unit.
  let MicroOpBufferSize = 64;

  // FPU load latency. This is the worst case: integer loads have a 3 cycle
  // latency.
  let LoadLatency = 5;

  let HighLatency = 25;

  // Minimum branch misdirection penalty.
  let MispredictPenalty = 14;

  let PostRAScheduler = 1;

  // FIXME: SSE4/AVX is unimplemented. The model is flagged as incomplete so
  // that the scheduler can assign a default model to unrecognized opcodes.
  let CompleteModel = 0;
}
2906f32e7eSjoerg
3006f32e7eSjoerglet SchedModel = BtVer2Model in {
3106f32e7eSjoerg
// Jaguar can issue up to 6 micro-ops in one cycle.

// Integer pipes.
def JALU0 : ProcResource<1>; // Pipe0: integer ALU0 (also handles FP->INT jam)
def JALU1 : ProcResource<1>; // Pipe1: integer ALU1/MUL/DIV
def JLAGU : ProcResource<1>; // Pipe2: LAGU
def JSAGU : ProcResource<1>; // Pipe3: SAGU (also handles 3-operand LEA)

// Vector/FPU pipes.
def JFPU0 : ProcResource<1>; // Pipe0: VALU0/VIMUL/FPA
def JFPU1 : ProcResource<1>; // Pipe1: VALU1/STC/FPM
3906f32e7eSjoerg
4006f32e7eSjoerg// The Integer PRF for Jaguar is 64 entries, and it holds the architectural and
4106f32e7eSjoerg// speculative version of the 64-bit integer registers.
4206f32e7eSjoerg// Reference: www.realworldtech.com/jaguar/4/
4306f32e7eSjoerg//
4406f32e7eSjoerg// The processor always keeps the different parts of an integer register
4506f32e7eSjoerg// together. An instruction that writes to a part of a register will therefore
4606f32e7eSjoerg// have a false dependence on any previous write to the same register or any
4706f32e7eSjoerg// part of it.
4806f32e7eSjoerg// Reference: Section 21.10 "AMD Bobcat and Jaguar pipeline: Partial register
4906f32e7eSjoerg// access" - Agner Fog's "microarchitecture.pdf".
def JIntegerPRF : RegisterFile<
    64,          // Number of physical registers.
    [GR64, CCR], // Register classes renamed through this file.
    [1, 1],      // One list entry per register class above.
    [1, 0],      // One list entry per register class above.
    0,           // Max moves that can be eliminated per cycle.
    1>;          // Restrict move elimination to zero regs.
5306f32e7eSjoerg
5406f32e7eSjoerg// The Jaguar FP Retire Queue renames SIMD and FP uOps onto a pool of 72 SSE
5506f32e7eSjoerg// registers. Operations on 256-bit data types are cracked into two COPs.
5606f32e7eSjoerg// Reference: www.realworldtech.com/jaguar/4/
5706f32e7eSjoerg
// The PRF in the floating point unit can eliminate a move from an MMX or SSE
// register that is known to be zero (i.e. it has been zeroed using a zero-idiom
// dependency breaking instruction, or via VZEROALL).
6106f32e7eSjoerg// Reference: Section 21.8 "AMD Bobcat and Jaguar pipeline: Dependency-breaking
6206f32e7eSjoerg// instructions" - Agner Fog's "microarchitecture.pdf"
def JFpuPRF : RegisterFile<
    72,                   // Number of physical registers.
    [VR64, VR128, VR256], // Register classes renamed through this file.
    [1, 1, 2],            // One list entry per register class above.
    [1, 1, 0],            // One list entry per register class above.
    0,                    // Max moves that can be eliminated per cycle.
    1>;                   // Restrict move elimination to zero regs.
6606f32e7eSjoerg
// Retire control unit (RCU): tracks up to 64 macro-ops in flight and retires
// at most two macro-ops per cycle.
// Reference: "Software Optimization Guide for AMD Family 16h Processors"
def JRCU : RetireControlUnit<
    64, // Macro-ops that can be in flight.
    2>; // Macro-ops retired per cycle.
7106f32e7eSjoerg
// Scheduler queues. Each group covers the pipes it can dispatch to, and
// BufferSize is set on the group.

// Integer pipe scheduler: feeds JALU0 and JALU1.
def JALU01 : ProcResGroup<[JALU0, JALU1]> { let BufferSize = 20; }

// AGU pipe scheduler: feeds the load and store AGUs.
def JLSAGU : ProcResGroup<[JLAGU, JSAGU]> { let BufferSize = 12; }

// FPU pipe scheduler: feeds JFPU0 and JFPU1.
def JFPU01 : ProcResGroup<[JFPU0, JFPU1]> { let BufferSize = 18; }
8606f32e7eSjoerg
// Functional units.

// Integer.
def JDiv : ProcResource<1>; // integer division
def JMul : ProcResource<1>; // integer multiplication

// Vector integer.
def JVALU0 : ProcResource<1>; // vector integer
def JVALU1 : ProcResource<1>; // vector integer
def JVIMUL : ProcResource<1>; // vector integer multiplication

// Vector store/convert and floating point.
def JSTC : ProcResource<1>; // vector store/convert
def JFPM : ProcResource<1>; // FP multiplication
def JFPA : ProcResource<1>; // FP addition

// Functional unit groups.
def JFPX : ProcResGroup<[JFPA, JFPM]>;
def JVALU : ProcResGroup<[JVALU0, JVALU1]>;
10006f32e7eSjoerg
// Integer loads take 3 cycles, so a ReadAfterLd register operand does not need
// to be available until 3 cycles after the memory operand.
def : ReadAdvance<ReadAfterLd, 3>;

// Vector loads take 5 cycles, so ReadAfterVec*Ld register operands do not need
// to be available until 5 cycles after the memory operand.
def : ReadAdvance<ReadAfterVecLd, 5>;
def : ReadAdvance<ReadAfterVecXLd, 5>;
def : ReadAdvance<ReadAfterVecYLd, 5>;

/// "Additional 6 cycle transfer operation which moves a floating point
/// operation input value from the integer unit to the floating point unit."
/// Reference: AMDfam16h SOG (Appendix A "Instruction Latencies", Section A.2).
/// The transfer is expressed as a negative read-advance of 6 cycles.
def : ReadAdvance<ReadInt2Fpu, -6>;
11506f32e7eSjoerg
11606f32e7eSjoerg// Many SchedWrites are defined in pairs with and without a folded load.
11706f32e7eSjoerg// Instructions with folded loads are usually micro-fused, so they only appear
11806f32e7eSjoerg// as two micro-ops when dispatched by the schedulers.
11906f32e7eSjoerg// This multiclass defines the resource usage for variants with and without
12006f32e7eSjoerg// folded loads.
// Emits the WriteRes pair for an integer SchedWrite: one record for the
// register form (SchedRW) and one for the folded-load form (SchedRW.Folded).
//   ExePorts - resources consumed by the register form.
//   Lat      - latency of the register form.
//   Res      - resource cycles, one entry per ExePorts element; may be empty.
//   UOps     - micro-op count of the register form.
//   LoadUOps - extra micro-ops added for the folded-load form.
multiclass JWriteResIntPair<X86FoldableSchedWrite SchedRW,
                            list<ProcResourceKind> ExePorts,
                            int Lat, list<int> Res = [], int UOps = 1,
                            int LoadUOps = 0> {
  // Register form: the arguments are forwarded unchanged.
  def : WriteRes<SchedRW, ExePorts> {
    let NumMicroOps = UOps;
    let Latency = Lat;
    let ResourceCycles = Res;
  }

  // Folded-load form: JLAGU is prepended to the resource list and the latency
  // grows by 3 cycles. When Res is non-empty, a 1-cycle entry for JLAGU is
  // prepended so the cycle list stays aligned with the resource list. An
  // empty Res is passed through as empty.
  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
    let NumMicroOps = !add(UOps, LoadUOps);
    let Latency = !add(Lat, 3);
    let ResourceCycles = !if(!empty(Res),
                             [],
                             !listconcat([1], Res));
  }
}
14006f32e7eSjoerg
// Emits the WriteRes pair for a scalar/128-bit FPU SchedWrite: one record for
// the register form (SchedRW) and one for the folded-load form
// (SchedRW.Folded).
//   ExePorts - resources consumed by the register form.
//   Lat      - latency of the register form.
//   Res      - resource cycles, one entry per ExePorts element; may be empty.
//   UOps     - micro-op count of the register form.
//   LoadUOps - extra micro-ops added for the folded-load form.
multiclass JWriteResFpuPair<X86FoldableSchedWrite SchedRW,
                            list<ProcResourceKind> ExePorts,
                            int Lat, list<int> Res = [], int UOps = 1,
                            int LoadUOps = 0> {
  // Register form: the arguments are forwarded unchanged.
  def : WriteRes<SchedRW, ExePorts> {
    let NumMicroOps = UOps;
    let Latency = Lat;
    let ResourceCycles = Res;
  }

  // Folded-load form: JLAGU is prepended to the resource list and the latency
  // grows by 5 cycles. When Res is non-empty, a 1-cycle entry for JLAGU is
  // prepended so the cycle list stays aligned with the resource list. An
  // empty Res is passed through as empty.
  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
    let NumMicroOps = !add(UOps, LoadUOps);
    let Latency = !add(Lat, 5);
    let ResourceCycles = !if(!empty(Res),
                             [],
                             !listconcat([1], Res));
  }
}
16006f32e7eSjoerg
// Emits the WriteRes pair for a 256-bit (YMM) SchedWrite: one record for the
// register form (SchedRW) and one for the folded-load form (SchedRW.Folded).
// The defaults differ from the other pair multiclasses: Res defaults to [2]
// and UOps defaults to 2.
//   ExePorts - resources consumed by the register form.
//   Lat      - latency of the register form.
//   Res      - resource cycles, one entry per ExePorts element.
//   UOps     - micro-op count of the register form.
//   LoadUOps - extra micro-ops added for the folded-load form.
multiclass JWriteResYMMPair<X86FoldableSchedWrite SchedRW,
                            list<ProcResourceKind> ExePorts,
                            int Lat, list<int> Res = [2], int UOps = 2,
                            int LoadUOps = 0> {
  // Register form: the arguments are forwarded unchanged.
  def : WriteRes<SchedRW, ExePorts> {
    let NumMicroOps = UOps;
    let Latency = Lat;
    let ResourceCycles = Res;
  }

  // Folded-load form: JLAGU is prepended to the resource list with 2 cycles,
  // and the latency grows by 5 cycles. Unlike the other pair multiclasses,
  // the JLAGU entry is prepended unconditionally.
  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
    let NumMicroOps = !add(UOps, LoadUOps);
    let Latency = !add(Lat, 5);
    let ResourceCycles = !listconcat([2], Res);
  }
}
18006f32e7eSjoerg
18106f32e7eSjoerg// Instructions that have local forwarding disabled have an extra +1cy latency.
18206f32e7eSjoerg
// A folded store needs one cycle on the SAGU for the store data; most RMW
// instructions don't need an extra uop, hence the micro-op count of 0.
// ALU RMW operations don't seem to benefit from STLF and their observed
// latency is 6cy, which is why this write adds two extra cycles instead of
// just 1cy for the store.
defm : X86WriteRes<WriteRMW,
                   [JSAGU], // resources
                   2,       // latency
                   [1],     // resource cycles
                   0>;      // micro-ops
18806f32e7eSjoerg
18906f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
19006f32e7eSjoerg// Arithmetic.
19106f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
19206f32e7eSjoerg
// Simple ALU operations; JWriteResIntPair also emits the folded-load variant.
defm : JWriteResIntPair<WriteALU, [JALU01], 1>;
defm : JWriteResIntPair<WriteADC, [JALU01], 1, [2]>;

// Byte swap, compare-and-exchange and exchange.
defm : X86WriteRes<WriteBSWAP32, [JALU01], 1, [1], 1>;
defm : X86WriteRes<WriteBSWAP64, [JALU01], 1, [1], 1>;
defm : X86WriteRes<WriteCMPXCHG, [JALU01], 3, [3], 5>;
defm : X86WriteRes<WriteCMPXCHGRMW, [JALU01, JSAGU, JLAGU], 11, [3, 1, 1], 6>;
defm : X86WriteRes<WriteXCHG, [JALU01], 1, [2], 2>;
20106f32e7eSjoerg
// Integer multiplication: every form occupies JALU1. All but WriteIMulH also
// occupy the JMul functional unit.
// 8-bit.
defm : JWriteResIntPair<WriteIMul8, [JALU1, JMul], 3, [1, 1], 1>;
// 16-bit.
defm : JWriteResIntPair<WriteIMul16, [JALU1, JMul], 3, [1, 3], 3>;
defm : JWriteResIntPair<WriteIMul16Imm, [JALU1, JMul], 4, [1, 2], 2>;
defm : JWriteResIntPair<WriteIMul16Reg, [JALU1, JMul], 3, [1, 1], 1>;
// 32-bit.
defm : JWriteResIntPair<WriteIMul32, [JALU1, JMul], 3, [1, 2], 2>;
defm : JWriteResIntPair<WriteIMul32Imm, [JALU1, JMul], 3, [1, 1], 1>;
defm : JWriteResIntPair<WriteIMul32Reg, [JALU1, JMul], 3, [1, 1], 1>;
// 64-bit.
defm : JWriteResIntPair<WriteIMul64, [JALU1, JMul], 6, [1, 4], 2>;
defm : JWriteResIntPair<WriteIMul64Imm, [JALU1, JMul], 6, [1, 4], 1>;
defm : JWriteResIntPair<WriteIMul64Reg, [JALU1, JMul], 6, [1, 4], 1>;
// WriteIMulH.
defm : X86WriteRes<WriteIMulH, [JALU1], 6, [4], 1>;
21306f32e7eSjoerg
// Integer division: JDiv is held for as many cycles as the latency.
// WriteDiv* entries.
defm : JWriteResIntPair<WriteDiv8, [JALU1, JDiv], 12, [1, 12], 1>;
defm : JWriteResIntPair<WriteDiv16, [JALU1, JDiv], 17, [1, 17], 2>;
defm : JWriteResIntPair<WriteDiv32, [JALU1, JDiv], 25, [1, 25], 2>;
defm : JWriteResIntPair<WriteDiv64, [JALU1, JDiv], 41, [1, 41], 2>;
// WriteIDiv* entries (same numbers as the WriteDiv* entries above).
defm : JWriteResIntPair<WriteIDiv8, [JALU1, JDiv], 12, [1, 12], 1>;
defm : JWriteResIntPair<WriteIDiv16, [JALU1, JDiv], 17, [1, 17], 2>;
defm : JWriteResIntPair<WriteIDiv32, [JALU1, JDiv], 25, [1, 25], 2>;
defm : JWriteResIntPair<WriteIDiv64, [JALU1, JDiv], 41, [1, 41], 2>;
22206f32e7eSjoerg
defm : JWriteResIntPair<WriteCRC32, [JALU01], 3, [4], 3>;

// Conditional move.
defm : JWriteResIntPair<WriteCMOV, [JALU01], 1>;
// x87 conditional move.
defm : X86WriteRes<WriteFCMOV, [JFPU0, JFPA], 3, [1, 1], 1>;
// Setcc.
def : WriteRes<WriteSETCC, [JALU01]>;
def : WriteRes<WriteSETCCStore, [JALU01, JSAGU]>;
def : WriteRes<WriteLAHFSAHF, [JALU01]>;
23006f32e7eSjoerg
// Bit test. The *Ld entries additionally occupy JLAGU.
defm : X86WriteRes<WriteBitTest, [JALU01], 1, [1], 1>;
defm : X86WriteRes<WriteBitTestImmLd, [JALU01, JLAGU], 4, [1, 1], 1>;
defm : X86WriteRes<WriteBitTestRegLd, [JALU01, JLAGU], 4, [1, 1], 5>;

// Bit test and set. The *Ld entries additionally occupy JLAGU.
defm : X86WriteRes<WriteBitTestSet, [JALU01], 1, [1], 2>;
defm : X86WriteRes<WriteBitTestSetImmLd, [JALU01, JLAGU], 4, [1, 1], 4>;
defm : X86WriteRes<WriteBitTestSetRegLd, [JALU01, JLAGU], 4, [1, 1], 8>;
23706f32e7eSjoerg
// Simple LEAs with one or two input operands.
def : WriteRes<WriteLEA, [JALU01]>;

// Bit counts.
defm : JWriteResIntPair<WriteBSF, [JALU01], 4, [8], 7>;
defm : JWriteResIntPair<WriteBSR, [JALU01], 5, [8], 8>;
defm : JWriteResIntPair<WritePOPCNT, [JALU01], 1>;
defm : JWriteResIntPair<WriteLZCNT, [JALU01], 1>;
defm : JWriteResIntPair<WriteTZCNT, [JALU01], 2, [2], 2>;

// BMI1 BEXTR/BLS, BMI2 BZHI. WriteBZHI is marked unsupported.
defm : JWriteResIntPair<WriteBEXTR, [JALU01], 1>;
defm : JWriteResIntPair<WriteBLS, [JALU01], 2, [2], 2>;
defm : X86WriteResPairUnsupported<WriteBZHI>;
25206f32e7eSjoerg
25306f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
25406f32e7eSjoerg// Integer shifts and rotates.
25506f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
25606f32e7eSjoerg
// Shifts and rotates: 1 cycle latency on either integer ALU.
defm : JWriteResIntPair<WriteShift, [JALU01], 1>;
defm : JWriteResIntPair<WriteShiftCL, [JALU01], 1>;
defm : JWriteResIntPair<WriteRotate, [JALU01], 1>;
defm : JWriteResIntPair<WriteRotateCL, [JALU01], 1>;

// SHLD/SHRD. The two memory entries (mri, mrcl) additionally occupy JLAGU.
defm : X86WriteRes<WriteSHDrri, [JALU01], 3, [6], 6>;
defm : X86WriteRes<WriteSHDrrcl, [JALU01], 4, [8], 7>;
defm : X86WriteRes<WriteSHDmri, [JLAGU, JALU01], 9, [1, 22], 8>;
defm : X86WriteRes<WriteSHDmrcl, [JLAGU, JALU01], 9, [1, 22], 8>;
26706f32e7eSjoerg
26806f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
26906f32e7eSjoerg// Loads, stores, and moves, not folded with other operations.
27006f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
27106f32e7eSjoerg
// Plain loads go through the load AGU with a 3 cycle latency.
def : WriteRes<WriteLoad, [JLAGU]> {
  let Latency = 3;
}
// Plain and non-temporal stores go through the store AGU.
def : WriteRes<WriteStore, [JSAGU]>;
def : WriteRes<WriteStoreNT, [JSAGU]>;
// Register moves use either integer ALU.
def : WriteRes<WriteMove, [JALU01]>;

// Load/store MXCSR.
def : WriteRes<WriteLDMXCSR, [JLAGU]> {
  let Latency = 3;
}
def : WriteRes<WriteSTMXCSR, [JSAGU]>;

// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;
28306f32e7eSjoerg
28406f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
28506f32e7eSjoerg// Idioms that clear a register, like xorps %xmm0, %xmm0.
28606f32e7eSjoerg// These can often bypass execution ports completely.
28706f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
28806f32e7eSjoerg
// The resource list is empty: WriteZero consumes no processor resource.
def : WriteRes<WriteZero, []>;
29006f32e7eSjoerg
29106f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
29206f32e7eSjoerg// Branches don't produce values, so they have no latency, but they still
29306f32e7eSjoerg// consume resources. Indirect branches can fold loads.
29406f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
29506f32e7eSjoerg
// Jumps use either integer ALU. The pair multiclass also emits the
// folded-load form.
defm : JWriteResIntPair<WriteJump, [JALU01], 1>;
29706f32e7eSjoerg
29806f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
29906f32e7eSjoerg// Special case scheduling classes.
30006f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
30106f32e7eSjoerg
// System and microcoded instructions are given a latency of 100 cycles.
def : WriteRes<WriteSystem, [JALU01]> {
  let Latency = 100;
}
def : WriteRes<WriteMicrocoded, [JALU01]> {
  let Latency = 100;
}
def : WriteRes<WriteFence, [JSAGU]>;

// Nops don't have dependencies, so there's no actual latency. The latency is
// set to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
def : WriteRes<WriteNop, [JALU01]> {
  let Latency = 1;
}
30906f32e7eSjoerg
// Selected by JWriteCMPXCHGVariant when IsRegRegCompareAndSwap_8 holds.
def JWriteCMPXCHG8rr : SchedWriteRes<[JALU01]> {
  let NumMicroOps = 3;
  let Latency = 3;
  let ResourceCycles = [3];
}
31506f32e7eSjoerg
// Selected by JWriteCMPXCHGVariant when IsAtomicCompareAndSwap_8 holds.
// Both AGUs are held for the full 16 cycle latency.
def JWriteLOCK_CMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let NumMicroOps = 5;
  let Latency = 16;
  let ResourceCycles = [3, 16, 16];
}
32106f32e7eSjoerg
// Selected by JWriteCMPXCHGVariant when IsAtomicCompareAndSwap holds.
// Both AGUs are held for the full 17 cycle latency.
def JWriteLOCK_CMPXCHGrm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let NumMicroOps = 6;
  let Latency = 17;
  let ResourceCycles = [3, 17, 17];
}
32706f32e7eSjoerg
// Selected by JWriteCMPXCHGVariant when IsRegMemCompareAndSwap_8 holds.
def JWriteCMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let NumMicroOps = 5;
  let Latency = 11;
  let ResourceCycles = [3, 1, 1];
}
33306f32e7eSjoerg
// Selected by JWriteCMPXCHGVariant when IsCompareAndSwap8B holds.
def JWriteCMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let NumMicroOps = 18;
  let Latency = 11;
  let ResourceCycles = [3, 1, 1];
}
33906f32e7eSjoerg
// Selected by JWriteCMPXCHGVariant when IsCompareAndSwap16B holds.
def JWriteCMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let NumMicroOps = 28;
  let Latency = 32;
  let ResourceCycles = [6, 1, 1];
}
34506f32e7eSjoerg
// Selected by JWriteCMPXCHGVariant when IsAtomicCompareAndSwap8B holds.
// Both AGUs are held for the full 19 cycle latency.
def JWriteLOCK_CMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let NumMicroOps = 18;
  let Latency = 19;
  let ResourceCycles = [3, 19, 19];
}
35106f32e7eSjoerg
// Selected by JWriteCMPXCHGVariant when IsAtomicCompareAndSwap16B holds.
// Both AGUs are held for the full 38 cycle latency.
def JWriteLOCK_CMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let NumMicroOps = 28;
  let Latency = 38;
  let ResourceCycles = [6, 38, 38];
}
35706f32e7eSjoerg
// Picks the scheduling write for a CMPXCHG-family instruction from a list of
// predicates. The entry order is kept exactly as-is: the final NoSchedPred
// entry acts as the fallback.
// NOTE(review): this assumes the variants are tried in list order - confirm.
def JWriteCMPXCHGVariant : SchedWriteVariant<[
  // Atomic forms.
  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap8B>,
           [JWriteLOCK_CMPXCHG8B]>,
  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap16B>,
           [JWriteLOCK_CMPXCHG16B]>,
  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap_8>,
           [JWriteLOCK_CMPXCHG8rm]>,
  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap>,
           [JWriteLOCK_CMPXCHGrm]>,
  // Non-atomic forms.
  SchedVar<MCSchedPredicate<IsCompareAndSwap8B>,
           [JWriteCMPXCHG8B]>,
  SchedVar<MCSchedPredicate<IsCompareAndSwap16B>,
           [JWriteCMPXCHG16B]>,
  SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap_8>,
           [JWriteCMPXCHG8rm]>,
  SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap>,
           [WriteCMPXCHGRMW]>,
  SchedVar<MCSchedPredicate<IsRegRegCompareAndSwap_8>,
           [JWriteCMPXCHG8rr]>,
  // Fallback.
  SchedVar<NoSchedPred,
           [WriteCMPXCHG]>
]>;
37006f32e7eSjoerg
// Memory forms of CMPXCHG, with and without LOCK.
// The first five reads come from the memory load operand and are left at
// ReadDefault. The remaining input operands, including the implicit read of
// RAX, get a read-advance.
def : InstRW<[JWriteCMPXCHGVariant,
              ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
              ReadAfterLd, ReadAfterLd],
             (instrs LCMPXCHG8, LCMPXCHG16, LCMPXCHG32, LCMPXCHG64,
                     CMPXCHG8rm, CMPXCHG16rm, CMPXCHG32rm, CMPXCHG64rm)>;

// Register-register forms have no memory operand, so no reads are listed.
def : InstRW<[JWriteCMPXCHGVariant],
             (instrs CMPXCHG8rr, CMPXCHG16rr, CMPXCHG32rr, CMPXCHG64rr)>;

// CMPXCHG8B/CMPXCHG16B, with and without LOCK.
def : InstRW<[JWriteCMPXCHGVariant,
              // Ignore reads contributed by the memory operand.
              ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
              // Add a read-advance to every implicit register read.
              ReadAfterLd, ReadAfterLd, ReadAfterLd, ReadAfterLd],
             (instrs LCMPXCHG8B, LCMPXCHG16B, CMPXCHG8B, CMPXCHG16B)>;
39006f32e7eSjoerg
// Selected by JWriteLOCK_ALURMWVariant when CheckLockPrefix holds.
// Both AGUs are held for the full 19 cycle latency.
def JWriteLOCK_ALURMW : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let NumMicroOps = 1;
  let Latency = 19;
  let ResourceCycles = [1, 19, 19];
}
39606f32e7eSjoerg
// Chooses JWriteLOCK_ALURMW when CheckLockPrefix holds, and WriteALURMW
// otherwise.
def JWriteLOCK_ALURMWVariant : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_ALURMW]>,
  SchedVar<NoSchedPred, [WriteALURMW]>
]>;

// Memory forms of INC/DEC/NOT/NEG use the variant above.
def : InstRW<[JWriteLOCK_ALURMWVariant],
             (instrs INC8m, INC16m, INC32m, INC64m,
                     DEC8m, DEC16m, DEC32m, DEC64m,
                     NOT8m, NOT16m, NOT32m, NOT64m,
                     NEG8m, NEG16m, NEG32m, NEG64m)>;
40506f32e7eSjoerg
// Shared by the 8-bit register-register XCHG and every register-register
// XADD.
def JWriteXCHG8rr_XADDrr : SchedWriteRes<[JALU01]> {
  let NumMicroOps = 3;
  let Latency = 2;
  let ResourceCycles = [3];
}

def : InstRW<[JWriteXCHG8rr_XADDrr],
             (instrs XCHG8rr,
                     XADD8rr, XADD16rr, XADD32rr, XADD64rr)>;
41306f32e7eSjoerg
41406f32e7eSjoerg// This write defines the latency of the in/out register operand of a non-atomic
41506f32e7eSjoerg// XADDrm. This is the first of a pair of writes that model non-atomic
41606f32e7eSjoerg// XADDrm instructions (the second write definition is JWriteXADDrm_LdSt_Part).
41706f32e7eSjoerg//
41806f32e7eSjoerg// We need two writes because the instruction latency differs from the output
41906f32e7eSjoerg// register operand latency. In particular, the first write describes the first
42006f32e7eSjoerg// (and only) output register operand of the instruction.  However, the
42106f32e7eSjoerg// instruction latency is set to the MAX of all the write latencies. That's why
42206f32e7eSjoerg// a second write is needed in this case (see example below).
42306f32e7eSjoerg//
42406f32e7eSjoerg// Example:
42506f32e7eSjoerg//     XADD %ecx, (%rsp)      ## Instruction latency: 11cy
42606f32e7eSjoerg//                            ## ECX write Latency: 3cy
42706f32e7eSjoerg//
42806f32e7eSjoerg// Register ECX becomes available in 3 cycles. That is because the value of ECX
42906f32e7eSjoerg// is exchanged with the value read from the stack pointer, and the load-to-use
43006f32e7eSjoerg// latency is assumed to be 3cy.
// Used as the non-LOCK alternative of JWriteXADDrm_Part1.
def JWriteXADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
  let NumMicroOps = 3;
  let Latency = 3; // load-to-use latency
  let ResourceCycles = [3];
}
43606f32e7eSjoerg
43706f32e7eSjoerg// This write defines the latency of the in/out register operand of an atomic
43806f32e7eSjoerg// XADDrm. This is the first of a sequence of two writes used to model atomic
43906f32e7eSjoerg// XADD instructions. The second write of the sequence is JWriteXCHGrm_LdSt_Part.
44006f32e7eSjoerg//
44106f32e7eSjoerg//
44206f32e7eSjoerg// Example:
44306f32e7eSjoerg//    LOCK XADD %ecx, (%rsp)     ## Instruction Latency: 16cy
44406f32e7eSjoerg//                               ## ECX write Latency: 11cy
44506f32e7eSjoerg//
44606f32e7eSjoerg// The value of ECX becomes available only after 11cy from the start of
44706f32e7eSjoerg// execution. This write is used to specifically set that operand latency.
// Used as the LOCK alternative of JWriteXADDrm_Part1.
def JWriteLOCK_XADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
  let NumMicroOps = 3;
  let Latency = 11; // The register operand becomes available after 11cy.
  let ResourceCycles = [3];
}
45306f32e7eSjoerg
45406f32e7eSjoerg// This write defines the latency of the in/out register operand of an atomic
45506f32e7eSjoerg// XCHGrm. This write is the first of a sequence of two writes that describe
45606f32e7eSjoerg// atomic XCHG operations. We need two writes because the instruction latency
45706f32e7eSjoerg// differs from the output register write latency.  We want to make sure that
45806f32e7eSjoerg// the output register operand becomes visible after 11cy. However, we want to
45906f32e7eSjoerg// set the instruction latency to 16cy.
// First write of the XCHGrm pair; it is paired with JWriteXCHGrm_LdSt_Part in
// the InstRW for XCHG8rm/16rm/32rm/64rm.
def JWriteXCHGrm_XCHG_Part : SchedWriteRes<[JALU01]> {
  let NumMicroOps = 2;
  let Latency = 11; // The output register becomes visible after 11cy.
  let ResourceCycles = [2];
}
46506f32e7eSjoerg
// Load/store half of a non-LOCK XADDrm: one cycle on each AGU, with an 11
// cycle latency.
def JWriteXADDrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> {
  let NumMicroOps = 1;
  let Latency = 11;
  let ResourceCycles = [1, 1];
}
47106f32e7eSjoerg
// Load/store half shared by XCHGrm and LOCK XADDrm: both AGUs are held for
// the full 16 cycle latency.
def JWriteXCHGrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> {
  let NumMicroOps = 1;
  let Latency = 16;
  let ResourceCycles = [16, 16];
}
47706f32e7eSjoerg
// First write of XADDrm: the choice depends on CheckLockPrefix.
def JWriteXADDrm_Part1 : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<CheckLockPrefix>,
           [JWriteLOCK_XADDrm_XCHG_Part]>,
  SchedVar<NoSchedPred,
           [JWriteXADDrm_XCHG_Part]>
]>;
48206f32e7eSjoerg
// Second write of XADDrm (load/store half): the choice depends on
// CheckLockPrefix. The LOCK case reuses JWriteXCHGrm_LdSt_Part.
def JWriteXADDrm_Part2 : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<CheckLockPrefix>,
           [JWriteXCHGrm_LdSt_Part]>,
  SchedVar<NoSchedPred,
           [JWriteXADDrm_LdSt_Part]>
]>;
48706f32e7eSjoerg
// XADD with a memory operand, with and without LOCK: two writes followed by a
// ReadAfterLd read.
def : InstRW<[JWriteXADDrm_Part1, JWriteXADDrm_Part2, ReadAfterLd],
             (instrs XADD8rm, XADD16rm, XADD32rm, XADD64rm,
                     LXADD8, LXADD16, LXADD32, LXADD64)>;

// XCHG with a memory operand: two writes followed by a ReadAfterLd read.
def : InstRW<[JWriteXCHGrm_XCHG_Part, JWriteXCHGrm_LdSt_Part, ReadAfterLd],
             (instrs XCHG8rm, XCHG16rm, XCHG32rm, XCHG64rm)>;
49406f32e7eSjoerg
49506f32e7eSjoerg
49606f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
49706f32e7eSjoerg// Floating point. This covers both scalar and vector operations.
49806f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
49906f32e7eSjoerg
// WriteFLD0 / WriteFLD1 / WriteFLDC.
defm : X86WriteRes<WriteFLD0, [JFPU1, JSTC], 3, [1, 1], 1>;
defm : X86WriteRes<WriteFLD1, [JFPU1, JSTC], 3, [1, 1], 1>;
defm : X86WriteRes<WriteFLDC, [JFPU1, JSTC], 3, [1, 1], 1>;

// FP/vector loads. The Y entry uses 2 cycles on JLAGU and 2 micro-ops.
defm : X86WriteRes<WriteFLoad, [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFLoadX, [JLAGU], 5, [1], 1>;
defm : X86WriteRes<WriteFLoadY, [JLAGU], 5, [2], 2>;

// Masked loads.
defm : X86WriteRes<WriteFMaskedLoad, [JLAGU, JFPU01, JFPX], 6, [1, 2, 2], 1>;
defm : X86WriteRes<WriteFMaskedLoadY, [JLAGU, JFPU01, JFPX], 6, [2, 4, 4], 2>;
50806f32e7eSjoerg
// FP/vector stores: all use the store AGU, JFPU1 and the store/convert unit.
defm : X86WriteRes<WriteFStore, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreX, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreY, [JSAGU, JFPU1, JSTC], 1, [2, 2, 2], 2>;

// Non-temporal FP/vector stores.
defm : X86WriteRes<WriteFStoreNT, [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreNTX, [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreNTY, [JSAGU, JFPU1, JSTC], 3, [2, 2, 2], 1>;
51506f32e7eSjoerg
// Masked FP stores. All four entries use the same seven resources; only the
// latency, the per-resource cycles and the micro-op count differ.
defm : X86WriteRes<WriteFMaskedStore32,
                   [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01],
                   16, [1, 1, 5, 5, 4, 4, 4], 19>;
defm : X86WriteRes<WriteFMaskedStore64,
                   [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01],
                   13, [1, 1, 2, 2, 2, 2, 2], 10>;
defm : X86WriteRes<WriteFMaskedStore32Y,
                   [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01],
                   22, [1, 1, 10, 10, 8, 8, 8], 36>;
defm : X86WriteRes<WriteFMaskedStore64Y,
                   [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01],
                   16, [1, 1, 4, 4, 4, 4, 4], 18>;
52006f32e7eSjoerg
// FP/vector register moves. The Y entry uses 2 cycles per resource and
// 2 micro-ops.
defm : X86WriteRes<WriteFMove, [JFPU01, JFPX], 1, [1, 1], 1>;
defm : X86WriteRes<WriteFMoveX, [JFPU01, JFPX], 1, [1, 1], 1>;
defm : X86WriteRes<WriteFMoveY, [JFPU01, JFPX], 1, [2, 2], 2>;

// WriteEMMS.
defm : X86WriteRes<WriteEMMS, [JFPU01, JFPX], 2, [1, 1], 1>;
52606f32e7eSjoerg
// FP add, compare and COM writes. All of them go through JFPU0 + JFPA (the
// COM writes additionally list JALU0). The Z variants are marked unsupported.
defm : JWriteResFpuPair<WriteFAdd,    [JFPU0, JFPA], 3>;
defm : JWriteResFpuPair<WriteFAddX,   [JFPU0, JFPA], 3>;
defm : JWriteResYMMPair<WriteFAddY,   [JFPU0, JFPA], 3, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFAddZ>;

defm : JWriteResFpuPair<WriteFAdd64,  [JFPU0, JFPA], 3>;
defm : JWriteResFpuPair<WriteFAdd64X, [JFPU0, JFPA], 3>;
defm : JWriteResYMMPair<WriteFAdd64Y, [JFPU0, JFPA], 3, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFAdd64Z>;

defm : JWriteResFpuPair<WriteFCmp,    [JFPU0, JFPA], 2>;
defm : JWriteResFpuPair<WriteFCmpX,   [JFPU0, JFPA], 2>;
defm : JWriteResYMMPair<WriteFCmpY,   [JFPU0, JFPA], 2, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFCmpZ>;

defm : JWriteResFpuPair<WriteFCmp64,  [JFPU0, JFPA], 2>;
defm : JWriteResFpuPair<WriteFCmp64X, [JFPU0, JFPA], 2>;
defm : JWriteResYMMPair<WriteFCmp64Y, [JFPU0, JFPA], 2, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFCmp64Z>;

defm : JWriteResFpuPair<WriteFCom,    [JFPU0, JFPA, JALU0], 3>;
defm : JWriteResFpuPair<WriteFComX,   [JFPU0, JFPA, JALU0], 3>;
// FP multiply writes (JFPU1 + JFPM).
defm : JWriteResFpuPair<WriteFMul,    [JFPU1, JFPM], 2>;
defm : JWriteResFpuPair<WriteFMulX,   [JFPU1, JFPM], 2>;
defm : JWriteResYMMPair<WriteFMulY,   [JFPU1, JFPM], 2, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFMulZ>;

defm : JWriteResFpuPair<WriteFMul64,  [JFPU1, JFPM], 4, [1, 2]>;
defm : JWriteResFpuPair<WriteFMul64X, [JFPU1, JFPM], 4, [1, 2]>;
defm : JWriteResYMMPair<WriteFMul64Y, [JFPU1, JFPM], 4, [2, 4], 2>;
defm : X86WriteResPairUnsupported<WriteFMul64Z>;

// Every FMA write is marked unsupported in this model.
defm : X86WriteResPairUnsupported<WriteFMA>;
defm : X86WriteResPairUnsupported<WriteFMAX>;
defm : X86WriteResPairUnsupported<WriteFMAY>;
defm : X86WriteResPairUnsupported<WriteFMAZ>;

// Dot-product writes additionally list JFPA.
defm : JWriteResFpuPair<WriteDPPD,  [JFPU1, JFPM, JFPA],  9, [1, 3, 3],  3>;
defm : JWriteResFpuPair<WriteDPPS,  [JFPU1, JFPM, JFPA], 11, [1, 3, 3],  5>;
defm : JWriteResYMMPair<WriteDPPSY, [JFPU1, JFPM, JFPA], 12, [2, 6, 6], 10>;
defm : X86WriteResPairUnsupported<WriteDPPSZ>;
// Reciprocal and reciprocal square-root estimates (JFPU1 + JFPM).
defm : JWriteResFpuPair<WriteFRcp,     [JFPU1, JFPM],  2>;
defm : JWriteResFpuPair<WriteFRcpX,    [JFPU1, JFPM],  2>;
defm : JWriteResYMMPair<WriteFRcpY,    [JFPU1, JFPM],  2, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFRcpZ>;

defm : JWriteResFpuPair<WriteFRsqrt,   [JFPU1, JFPM],  2>;
defm : JWriteResFpuPair<WriteFRsqrtX,  [JFPU1, JFPM],  2>;
defm : JWriteResYMMPair<WriteFRsqrtY,  [JFPU1, JFPM],  2, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;

// Divide and square root. In every entry the JFPM cycle count equals the
// third argument, and the Y variants use twice the values of the X variants.
defm : JWriteResFpuPair<WriteFDiv,     [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResFpuPair<WriteFDivX,    [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResYMMPair<WriteFDivY,    [JFPU1, JFPM], 38, [2, 38], 2>;
defm : X86WriteResPairUnsupported<WriteFDivZ>;

defm : JWriteResFpuPair<WriteFDiv64,   [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResFpuPair<WriteFDiv64X,  [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResYMMPair<WriteFDiv64Y,  [JFPU1, JFPM], 38, [2, 38], 2>;
defm : X86WriteResPairUnsupported<WriteFDiv64Z>;

defm : JWriteResFpuPair<WriteFSqrt,    [JFPU1, JFPM], 21, [1, 21]>;
defm : JWriteResFpuPair<WriteFSqrtX,   [JFPU1, JFPM], 21, [1, 21]>;
defm : JWriteResYMMPair<WriteFSqrtY,   [JFPU1, JFPM], 42, [2, 42], 2>;
defm : X86WriteResPairUnsupported<WriteFSqrtZ>;

defm : JWriteResFpuPair<WriteFSqrt64,  [JFPU1, JFPM], 27, [1, 27]>;
defm : JWriteResFpuPair<WriteFSqrt64X, [JFPU1, JFPM], 27, [1, 27]>;
defm : JWriteResYMMPair<WriteFSqrt64Y, [JFPU1, JFPM], 54, [2, 54], 2>;
defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;

defm : JWriteResFpuPair<WriteFSqrt80,  [JFPU1, JFPM], 35, [1, 35]>;
// Remaining FP writes: sign, rounding, logic, test, shuffle and blend.
defm : JWriteResFpuPair<WriteFSign,         [JFPU1, JFPM], 2>;

defm : JWriteResFpuPair<WriteFRnd,          [JFPU1, JSTC], 3>;
defm : JWriteResYMMPair<WriteFRndY,         [JFPU1, JSTC], 3, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFRndZ>;

defm : JWriteResFpuPair<WriteFLogic,        [JFPU01, JFPX], 1>;
defm : JWriteResYMMPair<WriteFLogicY,       [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFLogicZ>;

defm : JWriteResFpuPair<WriteFTest,         [JFPU0, JFPA, JALU0], 3>;
defm : JWriteResYMMPair<WriteFTestY,        [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
defm : X86WriteResPairUnsupported<WriteFTestZ>;

defm : JWriteResFpuPair<WriteFShuffle,      [JFPU01, JFPX], 1>;
defm : JWriteResYMMPair<WriteFShuffleY,     [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFShuffleZ>;

defm : JWriteResFpuPair<WriteFVarShuffle,   [JFPU01, JFPX], 3, [1, 4], 3>; // +1cy latency.
defm : JWriteResYMMPair<WriteFVarShuffleY,  [JFPU01, JFPX], 4, [2, 6], 6>; // +1cy latency.
defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;

defm : JWriteResFpuPair<WriteFBlend,        [JFPU01, JFPX], 1>;
defm : JWriteResYMMPair<WriteFBlendY,       [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFBlendZ>;

defm : JWriteResFpuPair<WriteFVarBlend,     [JFPU01, JFPX], 2, [4, 4], 3>;
defm : JWriteResYMMPair<WriteFVarBlendY,    [JFPU01, JFPX], 3, [6, 6], 6>;
defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;

defm : JWriteResFpuPair<WriteFShuffle256,   [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;
61006f32e7eSjoerg
61106f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
61206f32e7eSjoerg// Conversions.
61306f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
61406f32e7eSjoerg
// FP -> integer conversions. The SS2I/SD2I writes list five resources
// (including JALU0); the PS2I/PD2I writes stay on JFPU1 + JSTC, with the
// PD2IY variant additionally using JFPX.
defm : JWriteResFpuPair<WriteCvtSS2I,  [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1, 1, 1, 1, 1], 2>;
defm : JWriteResFpuPair<WriteCvtPS2I,  [JFPU1, JSTC],                     3, [1, 1],          1>;
defm : JWriteResYMMPair<WriteCvtPS2IY, [JFPU1, JSTC],                     3, [2, 2],          2>;
defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
defm : JWriteResFpuPair<WriteCvtSD2I,  [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1, 1, 1, 1, 1], 2>;
defm : JWriteResFpuPair<WriteCvtPD2I,  [JFPU1, JSTC],                     3, [1, 1],          1>;
defm : JWriteResYMMPair<WriteCvtPD2IY, [JFPU1, JSTC, JFPX],               6, [2, 2, 4],       3>;
defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
62306f32e7eSjoerg
// Integer -> FP conversions. The I2SS/I2SD writes and their Ld forms are
// given as separate X86WriteRes entries rather than through a pair
// multiclass; the Ld forms add JLAGU to the resource list.
defm : X86WriteRes<WriteCvtI2SS,        [JFPU1, JSTC],        4, [1, 1],    2>;
defm : X86WriteRes<WriteCvtI2SSLd,      [JLAGU, JFPU1, JSTC], 9, [1, 1, 1], 1>;
defm : JWriteResFpuPair<WriteCvtI2PS,   [JFPU1, JSTC],        3, [1, 1],    1>;
defm : JWriteResYMMPair<WriteCvtI2PSY,  [JFPU1, JSTC],        3, [2, 2],    2>;
defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
defm : X86WriteRes<WriteCvtI2SD,        [JFPU1, JSTC],        4, [1, 1],    2>;
defm : X86WriteRes<WriteCvtI2SDLd,      [JLAGU, JFPU1, JSTC], 9, [1, 1, 1], 1>;
defm : JWriteResFpuPair<WriteCvtI2PD,   [JFPU1, JSTC],        3, [1, 1],    1>;
defm : JWriteResYMMPair<WriteCvtI2PDY,  [JFPU1, JSTC],        3, [2, 2],    2>;
defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
63406f32e7eSjoerg
// Conversions to wider FP: SS2SD, PS2PD.
defm : JWriteResFpuPair<WriteCvtSS2SD,  [JFPU1, JSTC], 7, [1, 2], 2>;
defm : JWriteResFpuPair<WriteCvtPS2PD,  [JFPU1, JSTC], 2, [1, 1], 1>;
defm : JWriteResYMMPair<WriteCvtPS2PDY, [JFPU1, JSTC], 2, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;

// Conversions to narrower FP: SD2SS, PD2PS. The PD2PSY variant also uses JFPX.
defm : JWriteResFpuPair<WriteCvtSD2SS,  [JFPU1, JSTC],       7, [1, 2],    2>;
defm : JWriteResFpuPair<WriteCvtPD2PS,  [JFPU1, JSTC],       3, [1, 1],    1>;
defm : JWriteResYMMPair<WriteCvtPD2PSY, [JFPU1, JSTC, JFPX], 6, [2, 2, 4], 3>;
defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;

// PH2PS conversions.
defm : JWriteResFpuPair<WriteCvtPH2PS,  [JFPU1, JSTC], 3, [1, 1], 1>;
defm : JWriteResYMMPair<WriteCvtPH2PSY, [JFPU1, JSTC], 3, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;
64806f32e7eSjoerg
// PS2PH conversions: register forms first, then the St forms, which append
// JSAGU to the resource list. The Z variants are marked unsupported.
defm : X86WriteRes<WriteCvtPS2PH,    [JFPU1, JSTC],              3, [1, 1],       1>;
defm : X86WriteRes<WriteCvtPS2PHY,   [JFPU1, JSTC, JFPX],        6, [2, 2, 2],    3>;
defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
defm : X86WriteRes<WriteCvtPS2PHSt,  [JFPU1, JSTC, JSAGU],       4, [1, 1, 1],    1>;
defm : X86WriteRes<WriteCvtPS2PHYSt, [JFPU1, JSTC, JFPX, JSAGU], 7, [2, 2, 2, 1], 3>;
defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
65506f32e7eSjoerg
65606f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
65706f32e7eSjoerg// Vector integer operations.
65806f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
65906f32e7eSjoerg
// Vector integer loads: plain, X, Y, NT and masked variants.
defm : X86WriteRes<WriteVecLoad,        [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecLoadX,       [JLAGU],                5, [1],       1>;
defm : X86WriteRes<WriteVecLoadY,       [JLAGU],                5, [2],       2>;
defm : X86WriteRes<WriteVecLoadNT,      [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecLoadNTY,     [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecMaskedLoad,  [JLAGU, JFPU01, JVALU], 6, [1, 2, 2], 1>;
defm : X86WriteRes<WriteVecMaskedLoadY, [JLAGU, JFPU01, JVALU], 6, [2, 4, 4], 2>;
66706f32e7eSjoerg
// Vector integer stores (JSAGU, JFPU1, JSTC).
defm : X86WriteRes<WriteVecStore,    [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecStoreX,   [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecStoreY,   [JSAGU, JFPU1, JSTC], 1, [2, 2, 2], 2>;
defm : X86WriteRes<WriteVecStoreNT,  [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecStoreNTY, [JSAGU, JFPU1, JSTC], 2, [2, 2, 2], 1>;

// The vector integer masked stores are all marked unsupported.
defm : X86WriteResUnsupported<WriteVecMaskedStore32>;
defm : X86WriteResUnsupported<WriteVecMaskedStore64>;
defm : X86WriteResUnsupported<WriteVecMaskedStore32Y>;
defm : X86WriteResUnsupported<WriteVecMaskedStore64Y>;
67706f32e7eSjoerg
// Vector register moves, followed by the moves to and from GPRs.
defm : X86WriteRes<WriteVecMove,        [JFPU01, JVALU],      1, [1, 1],    1>;
defm : X86WriteRes<WriteVecMoveX,       [JFPU01, JVALU],      1, [1, 1],    1>;
defm : X86WriteRes<WriteVecMoveY,       [JFPU01, JVALU],      1, [2, 2],    2>;
defm : X86WriteRes<WriteVecMoveToGpr,   [JFPU0, JFPA, JALU0], 4, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecMoveFromGpr, [JFPU01, JFPX],       8, [1, 1],    2>;
68306f32e7eSjoerg
// Vector integer ALU and shift writes. Every Y and Z variant in this group,
// and all variable vector shifts, are marked unsupported.
defm : JWriteResFpuPair<WriteVecALU,       [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecALUX,      [JFPU01, JVALU], 1>;
defm : X86WriteResPairUnsupported<WriteVecALUY>;
defm : X86WriteResPairUnsupported<WriteVecALUZ>;

defm : JWriteResFpuPair<WriteVecShift,     [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecShiftX,    [JFPU01, JVALU], 2>; // +1cy latency.
defm : X86WriteResPairUnsupported<WriteVecShiftY>;
defm : X86WriteResPairUnsupported<WriteVecShiftZ>;

defm : JWriteResFpuPair<WriteVecShiftImm,  [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecShiftImmX, [JFPU01, JVALU], 2>; // +1cy latency.
defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;

defm : X86WriteResPairUnsupported<WriteVarVecShift>;
defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;

// Vector integer multiplies and sum-of-absolute-differences writes.
defm : JWriteResFpuPair<WriteVecIMul,      [JFPU0, JVIMUL], 2>;
defm : JWriteResFpuPair<WriteVecIMulX,     [JFPU0, JVIMUL], 2>;
defm : X86WriteResPairUnsupported<WriteVecIMulY>;
defm : X86WriteResPairUnsupported<WriteVecIMulZ>;

defm : JWriteResFpuPair<WritePMULLD,       [JFPU0, JFPU01, JVIMUL, JVALU], 4, [2, 1, 2, 1], 3>;
defm : X86WriteResPairUnsupported<WritePMULLDY>;
defm : X86WriteResPairUnsupported<WritePMULLDZ>;

defm : JWriteResFpuPair<WriteMPSAD,        [JFPU0, JVIMUL], 3, [1, 2], 3>;
defm : X86WriteResPairUnsupported<WriteMPSADY>;
defm : X86WriteResPairUnsupported<WriteMPSADZ>;

defm : JWriteResFpuPair<WritePSADBW,       [JFPU01, JVALU], 2>;
defm : JWriteResFpuPair<WritePSADBWX,      [JFPU01, JVALU], 2>;
defm : X86WriteResPairUnsupported<WritePSADBWY>;
defm : X86WriteResPairUnsupported<WritePSADBWZ>;
// Vector integer PHMINPOS, shuffle, blend, logic and test writes.
defm : JWriteResFpuPair<WritePHMINPOS,    [JFPU01, JVALU], 2>;

defm : JWriteResFpuPair<WriteShuffle,     [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteShuffleX,    [JFPU01, JVALU], 1>;
defm : X86WriteResPairUnsupported<WriteShuffleY>;
defm : X86WriteResPairUnsupported<WriteShuffleZ>;

defm : JWriteResFpuPair<WriteVarShuffle,  [JFPU01, JVALU], 2, [1, 1], 1>;
defm : JWriteResFpuPair<WriteVarShuffleX, [JFPU01, JVALU], 2, [1, 4], 3>;
defm : X86WriteResPairUnsupported<WriteVarShuffleY>;
defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;

defm : JWriteResFpuPair<WriteBlend,       [JFPU01, JVALU], 1>;
defm : X86WriteResPairUnsupported<WriteBlendY>;
defm : X86WriteResPairUnsupported<WriteBlendZ>;

defm : JWriteResFpuPair<WriteVarBlend,    [JFPU01, JVALU], 2, [4, 4], 3>;
defm : X86WriteResPairUnsupported<WriteVarBlendY>;
defm : X86WriteResPairUnsupported<WriteVarBlendZ>;

defm : JWriteResFpuPair<WriteVecLogic,    [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecLogicX,   [JFPU01, JVALU], 1>;
defm : X86WriteResPairUnsupported<WriteVecLogicY>;
defm : X86WriteResPairUnsupported<WriteVecLogicZ>;

defm : JWriteResFpuPair<WriteVecTest,     [JFPU0, JFPA, JALU0], 3>;
defm : JWriteResYMMPair<WriteVecTestY,    [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
defm : X86WriteResPairUnsupported<WriteVecTestZ>;

// The 256 writes below are marked unsupported.
defm : X86WriteResPairUnsupported<WriteShuffle256>;
defm : X86WriteResPairUnsupported<WriteVPMOV256>;
defm : X86WriteResPairUnsupported<WriteVarShuffle256>;
73806f32e7eSjoerg
73906f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
74006f32e7eSjoerg// Vector insert/extract operations.
74106f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
74206f32e7eSjoerg
// Insert (register and Ld forms) and extract (register and St forms).
defm : X86WriteRes<WriteVecInsert,    [JFPU01, JVALU],        1, [1, 1],    2>;
defm : X86WriteRes<WriteVecInsertLd,  [JFPU01, JVALU, JLAGU], 4, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecExtract,   [JFPU0, JFPA, JALU0],   3, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecExtractSt, [JFPU1, JSTC, JSAGU],   3, [1, 1, 1], 1>;
74706f32e7eSjoerg
74806f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
74906f32e7eSjoerg// SSE42 String instructions.
75006f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
75106f32e7eSjoerg
// PCmpIStr* writes share one five-entry resource list.
defm : JWriteResFpuPair<WritePCmpIStrI,
                        [JFPU1, JVALU1, JFPU0, JFPA, JALU0],
                        7, [2, 2, 1, 1, 1], 3>;
defm : JWriteResFpuPair<WritePCmpIStrM,
                        [JFPU1, JVALU1, JFPU0, JFPA, JALU0],
                        8, [2, 2, 1, 1, 1], 3>;

// PCmpEStr* writes share one seven-entry resource list and identical numbers.
defm : JWriteResFpuPair<WritePCmpEStrI,
                        [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0],
                        14, [1, 2, 2, 6, 4, 1, 1], 9>;
defm : JWriteResFpuPair<WritePCmpEStrM,
                        [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0],
                        14, [1, 2, 2, 6, 4, 1, 1], 9>;
75606f32e7eSjoerg
75706f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
75806f32e7eSjoerg// MOVMSK Instructions.
75906f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
76006f32e7eSjoerg
// The FP, vector and MMX MOVMSK writes all use JFPU0 + JFPA + JALU0 with a
// latency of 3. The Y vector variant is marked unsupported.
def : WriteRes<WriteFMOVMSK, [JFPU0, JFPA, JALU0]> {
  let Latency = 3;
}
def : WriteRes<WriteVecMOVMSK, [JFPU0, JFPA, JALU0]> {
  let Latency = 3;
}
defm : X86WriteResUnsupported<WriteVecMOVMSKY>;
def : WriteRes<WriteMMXMOVMSK, [JFPU0, JFPA, JALU0]> {
  let Latency = 3;
}
76506f32e7eSjoerg
76606f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
76706f32e7eSjoerg// AES Instructions.
76806f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
76906f32e7eSjoerg
// AES writes. IMC and KeyGen use JFPU0 + JVIMUL only; DecEnc additionally
// lists JFPU01 and JVALU.
defm : JWriteResFpuPair<WriteAESIMC,    [JFPU0, JVIMUL], 2>;
defm : JWriteResFpuPair<WriteAESKeyGen, [JFPU0, JVIMUL], 2>;
defm : JWriteResFpuPair<WriteAESDecEnc, [JFPU01, JVALU, JFPU0, JVIMUL], 3, [1, 1, 1, 1], 2>;
77306f32e7eSjoerg
77406f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
77506f32e7eSjoerg// Horizontal add/sub  instructions.
77606f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
77706f32e7eSjoerg
// FP horizontal add/sub writes.
defm : JWriteResFpuPair<WriteFHAdd,  [JFPU0, JFPA], 4>;             // +1cy latency.
defm : JWriteResYMMPair<WriteFHAddY, [JFPU0, JFPA], 4, [2, 2], 2>;  // +1cy latency.

// Integer horizontal add/sub writes; the Y variant is marked unsupported.
defm : JWriteResFpuPair<WritePHAdd,  [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WritePHAddX, [JFPU01, JVALU], 2>;           // +1cy latency.
defm : X86WriteResPairUnsupported<WritePHAddY>;
78306f32e7eSjoerg
78406f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
78506f32e7eSjoerg// Carry-less multiplication instructions.
78606f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
78706f32e7eSjoerg
// Carry-less multiply is mapped onto JFPU0 + JVIMUL.
defm : JWriteResFpuPair<WriteCLMul, [JFPU0, JVIMUL], 2>;
78906f32e7eSjoerg
79006f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
79106f32e7eSjoerg// SSE4A instructions.
79206f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
79306f32e7eSjoerg
// Dedicated write for INSERTQ/INSERTQI: latency 2, with one cycle on JFPU01
// and four cycles on JVALU.
def JWriteINSERTQ : SchedWriteRes<[JFPU01, JVALU]> {
  let ResourceCycles = [1, 4];
  let Latency = 2;
}
def : InstRW<[JWriteINSERTQ], (instrs INSERTQ,
                                      INSERTQI)>;
79906f32e7eSjoerg
80006f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
80106f32e7eSjoerg// AVX instructions.
80206f32e7eSjoerg////////////////////////////////////////////////////////////////////////////////
80306f32e7eSjoerg
// VEXTRACTF128rr gets its own write on JFPU01 + JFPX; no field is overridden,
// so the SchedWriteRes defaults apply.
def JWriteVecExtractF128 : SchedWriteRes<[JFPU01, JFPX]>;
def : InstRW<[JWriteVecExtractF128], (instrs VEXTRACTF128rr)>;
80606f32e7eSjoerg
// Write used for the YMM broadcast-from-memory instructions listed below:
// latency 6, two micro-ops, and 1/2/4 cycles on JLAGU/JFPU01/JFPX.
def JWriteVBROADCASTYLd : SchedWriteRes<[JLAGU, JFPU01, JFPX]> {
  let NumMicroOps = 2;
  let ResourceCycles = [1, 2, 4];
  let Latency = 6;
}
def : InstRW<[JWriteVBROADCASTYLd],
             (instrs VBROADCASTSDYrm, VBROADCASTSSYrm, VBROADCASTF128)>;
81506f32e7eSjoerg
// VZEROALL: no processor resources listed, latency 90, 73 micro-ops.
def JWriteJVZEROALL : SchedWriteRes<[]> {
  let NumMicroOps = 73;
  let Latency = 90;
}
def : InstRW<[JWriteJVZEROALL], (instrs VZEROALL)>;

// VZEROUPPER: no processor resources listed, latency 46, 37 micro-ops.
def JWriteJVZEROUPPER : SchedWriteRes<[]> {
  let NumMicroOps = 37;
  let Latency = 46;
}
def : InstRW<[JWriteJVZEROUPPER], (instrs VZEROUPPER)>;
82706f32e7eSjoerg
82806f32e7eSjoerg///////////////////////////////////////////////////////////////////////////////
//  SSE2/AVX Store Selected Bytes of Double Quadword - (V)MASKMOVDQU
83006f32e7eSjoerg///////////////////////////////////////////////////////////////////////////////
83106f32e7eSjoerg
// Write for the (V)MASKMOVDQU family: latency 34 and 63 micro-ops. The
// ResourceCycles entries line up one-to-one with the resource list.
def JWriteMASKMOVDQU
    : SchedWriteRes<[JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01]> {
  let NumMicroOps = 63;
  let ResourceCycles = [1, 1, 2, 2, 2, 16, 42];
  let Latency = 34;
}
def : InstRW<[JWriteMASKMOVDQU],
             (instrs MASKMOVDQU, MASKMOVDQU64, VMASKMOVDQU, VMASKMOVDQU64)>;
83906f32e7eSjoerg
84006f32e7eSjoerg///////////////////////////////////////////////////////////////////////////////
84106f32e7eSjoerg//  SchedWriteVariant definitions.
84206f32e7eSjoerg///////////////////////////////////////////////////////////////////////////////
84306f32e7eSjoerg
// A write with zero latency that lists no processor resources.
def JWriteZeroLatency : SchedWriteRes<[]> { let Latency = 0; }

// A two-micro-op write on JFPU01 + JFPX; latency is left at the
// SchedWriteRes default.
def JWriteZeroIdiomYmm : SchedWriteRes<[JFPU01, JFPX]> { let NumMicroOps = 2; }
85106f32e7eSjoerg
85206f32e7eSjoerg// Certain instructions that use the same register for both source
85306f32e7eSjoerg// operands do not have a real dependency on the previous contents of the
85406f32e7eSjoerg// register, and thus, do not have to wait before completing. They can be
85506f32e7eSjoerg// optimized out at register renaming stage.
85606f32e7eSjoerg// Reference: Section 10.8 of the "Software Optimization Guide for AMD Family
85706f32e7eSjoerg// 15h Processors".
// Reference: Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
// Section 21.8 [Dependency-breaking instructions].
86006f32e7eSjoerg
// GPR variant: selects JWriteZeroLatency when ZeroIdiomPredicate matches and
// falls back to WriteALU otherwise.
def JWriteZeroIdiom : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteALU]>
]>;
def : InstRW<[JWriteZeroIdiom], (instrs SUB32rr, SUB64rr, XOR32rr, XOR64rr)>;
86706f32e7eSjoerg
// FP logic variant: selects JWriteZeroLatency when ZeroIdiomPredicate matches
// and falls back to WriteFLogic otherwise.
def JWriteFZeroIdiom : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteFLogic]>
]>;
def : InstRW<[JWriteFZeroIdiom], (instrs XORPSrr,  VXORPSrr,
                                         XORPDrr,  VXORPDrr,
                                         ANDNPSrr, VANDNPSrr,
                                         ANDNPDrr, VANDNPDrr)>;
87506f32e7eSjoerg
// YMM FP logic variant: when ZeroIdiomPredicate matches, JWriteZeroIdiomYmm is
// selected (not JWriteZeroLatency); otherwise WriteFLogicY is used.
def JWriteFZeroIdiomY : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroIdiomYmm]>,
  SchedVar<NoSchedPred, [WriteFLogicY]>
]>;
def : InstRW<[JWriteFZeroIdiomY],
             (instrs VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr)>;
88206f32e7eSjoerg
// MMX logic variant: JWriteZeroLatency when ZeroIdiomPredicate matches,
// WriteVecLogic otherwise.
def JWriteVZeroIdiomLogic : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteVecLogic]>
]>;
def : InstRW<[JWriteVZeroIdiomLogic], (instrs MMX_PXORirr,
                                              MMX_PANDNirr)>;

// XMM logic variant: JWriteZeroLatency when ZeroIdiomPredicate matches,
// WriteVecLogicX otherwise.
def JWriteVZeroIdiomLogicX : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteVecLogicX]>
]>;
def : InstRW<[JWriteVZeroIdiomLogicX],
             (instrs PXORrr, VPXORrr, PANDNrr, VPANDNrr)>;
89506f32e7eSjoerg
// MMX ALU variant: JWriteZeroLatency when ZeroIdiomPredicate matches,
// WriteVecALU otherwise.
def JWriteVZeroIdiomALU : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteVecALU]>
]>;
def : InstRW<[JWriteVZeroIdiomALU],
             (instrs MMX_PSUBBirr,   MMX_PSUBDirr,
                     MMX_PSUBQirr,   MMX_PSUBWirr,
                     MMX_PSUBSBirr,  MMX_PSUBSWirr,
                     MMX_PSUBUSBirr, MMX_PSUBUSWirr,
                     MMX_PCMPGTBirr, MMX_PCMPGTDirr,
                     MMX_PCMPGTWirr)>;

// XMM ALU variant: JWriteZeroLatency when ZeroIdiomPredicate matches,
// WriteVecALUX otherwise. Each row pairs an SSE opcode with its VEX form.
def JWriteVZeroIdiomALUX : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteVecALUX]>
]>;
def : InstRW<[JWriteVZeroIdiomALUX],
             (instrs PSUBBrr,   VPSUBBrr,
                     PSUBDrr,   VPSUBDrr,
                     PSUBQrr,   VPSUBQrr,
                     PSUBWrr,   VPSUBWrr,
                     PSUBSBrr,  VPSUBSBrr,
                     PSUBSWrr,  VPSUBSWrr,
                     PSUBUSBrr, VPSUBUSBrr,
                     PSUBUSWrr, VPSUBUSWrr,
                     PCMPGTBrr, VPCMPGTBrr,
                     PCMPGTDrr, VPCMPGTDrr,
                     PCMPGTQrr, VPCMPGTQrr,
                     PCMPGTWrr, VPCMPGTWrr)>;
92306f32e7eSjoerg
// VPERM2F128rr: JWriteZeroIdiomYmm when ZeroIdiomVPERMPredicate matches,
// WriteFShuffle256 otherwise.
def JWriteVPERM2F128 : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomVPERMPredicate>, [JWriteZeroIdiomYmm]>,
  SchedVar<NoSchedPred, [WriteFShuffle256]>
]>;
def : InstRW<[JWriteVPERM2F128], (instrs VPERM2F128rr)>;
92906f32e7eSjoerg
// Write assigned to slow LEA instructions: latency 2 on JALU1 + JSAGU.
def JWrite3OpsLEA : SchedWriteRes<[JALU1, JSAGU]> { let Latency = 2; }
93406f32e7eSjoerg
// Jaguar treats an LEA as slow when either of the following holds:
//   * it is a 3-operand LEA (base, index, offset), or
//   * its `Scale` value is different than 1.
def JSlowLEAPredicate : MCSchedPredicate<
  CheckAny<[
    // Case 1: 3-operand LEA (base, index, offset).
    IsThreeOperandsLEAFn,
    // Case 2: operand 2 is an immediate and that immediate is not 1.
    CheckAll<[CheckIsImmOperand<2>, CheckNot<CheckImmOperand<2, 1>>]>
  ]>
>;
94806f32e7eSjoerg
// LEA32r/LEA64r/LEA64_32r: JWrite3OpsLEA when JSlowLEAPredicate matches,
// the generic WriteLEA otherwise.
def JWriteLEA : SchedWriteVariant<[
  SchedVar<JSlowLEAPredicate, [JWrite3OpsLEA]>,
  SchedVar<NoSchedPred, [WriteLEA]>
]>;
def : InstRW<[JWriteLEA], (instrs LEA32r,
                                  LEA64r,
                                  LEA64_32r)>;
95506f32e7eSjoerg
// LEA16r always uses this write: latency 3, four cycles on JALU01.
def JSlowLEA16r : SchedWriteRes<[JALU01]> {
  let ResourceCycles = [4];
  let Latency = 3;
}
def : InstRW<[JSlowLEA16r], (instrs LEA16r)>;
96206f32e7eSjoerg
96306f32e7eSjoerg///////////////////////////////////////////////////////////////////////////////
96406f32e7eSjoerg// Dependency breaking instructions.
96506f32e7eSjoerg///////////////////////////////////////////////////////////////////////////////
96606f32e7eSjoerg
// Opcodes registered as zero idioms, grouped by register class / ISA. Every
// group is guarded by ZeroIdiomPredicate except VPERM2F128rr, which uses
// ZeroIdiomVPERMPredicate.
def : IsZeroIdiomFunction<[
  // GPR Zero-idioms.
  DepBreakingClass<[SUB32rr, SUB64rr, XOR32rr, XOR64rr], ZeroIdiomPredicate>,

  // MMX Zero-idioms.
  DepBreakingClass<[
    MMX_PXORirr, MMX_PANDNirr,
    MMX_PSUBBirr, MMX_PSUBDirr, MMX_PSUBQirr, MMX_PSUBWirr,
    MMX_PSUBSBirr, MMX_PSUBSWirr, MMX_PSUBUSBirr, MMX_PSUBUSWirr,
    MMX_PCMPGTBirr, MMX_PCMPGTDirr, MMX_PCMPGTWirr
  ], ZeroIdiomPredicate>,

  // SSE Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr,

    // int variants.
    PXORrr, PANDNrr,
    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
    PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
    PCMPGTBrr, PCMPGTDrr, PCMPGTQrr, PCMPGTWrr
  ], ZeroIdiomPredicate>,

  // AVX Zero-idioms.
  DepBreakingClass<[
    // xmm fp variants.
    VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr,

    // xmm int variants.
    VPXORrr, VPANDNrr,
    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
    VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr,
    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,

    // ymm variants.
    VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr
  ], ZeroIdiomPredicate>,

  // VPERM2F128 has its own predicate.
  DepBreakingClass<[VPERM2F128rr], ZeroIdiomVPERMPredicate>
]>;
100806f32e7eSjoerg
// Opcodes registered as dependency-breaking. All groups use
// ZeroIdiomPredicate except CMP32rr/CMP64rr, which are guarded by
// CheckSameRegOperand<0, 1>.
def : IsDepBreakingFunction<[
  // GPR
  DepBreakingClass<[SBB32rr, SBB64rr], ZeroIdiomPredicate>,
  DepBreakingClass<[CMP32rr, CMP64rr], CheckSameRegOperand<0, 1> >,

  // MMX
  DepBreakingClass<[MMX_PCMPEQBirr, MMX_PCMPEQDirr, MMX_PCMPEQWirr],
                   ZeroIdiomPredicate>,

  // SSE
  DepBreakingClass<[PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr],
                   ZeroIdiomPredicate>,

  // AVX
  DepBreakingClass<[VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr],
                   ZeroIdiomPredicate>
]>;
102906f32e7eSjoerg
// Register-to-register moves registered as optimizable. The class is guarded
// by TruePred, i.e. no additional condition is placed on the listed opcodes.
def : IsOptimizableRegisterMove<[
  InstructionEquivalenceClass<[
    // GPR variants.
    MOV32rr, MOV64rr,

    // MMX variants.
    MMX_MOVQ64rr,

    // SSE variants.
    MOVAPSrr, MOVUPSrr, MOVAPDrr, MOVUPDrr, MOVDQArr, MOVDQUrr,

    // AVX variants.
    VMOVAPSrr, VMOVUPSrr, VMOVAPDrr, VMOVUPDrr, VMOVDQArr, VMOVDQUrr
  ], TruePred >
]>;
104906f32e7eSjoerg
105006f32e7eSjoerg} // SchedModel
1051