//=- X86ScheduleBtVer2.td - X86 BtVer2 (Jaguar) Scheduling ---*- tablegen -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the machine model for AMD btver2 (Jaguar) to support
// instruction scheduling and other instruction cost heuristics. Based off AMD Software
// Optimization Guide for AMD Family 16h Processors & Instruction Latency appendix.
//
//===----------------------------------------------------------------------===//

// Top-level machine model record for btver2. Every resource and write
// definition that follows is attached to it through the enclosing
// "let SchedModel = BtVer2Model in" scope.
def BtVer2Model : SchedMachineModel {
  // All x86 instructions are modeled as a single micro-op, and btver2 can
  // decode 2 instructions per cycle.
  let IssueWidth = 2;
  let MicroOpBufferSize = 64; // Retire Control Unit
  let LoadLatency = 5; // FPU latency (worst case; cf. integer 3 cycle latency)
  let HighLatency = 25;
  let MispredictPenalty = 14; // Minimum branch misdirection penalty
  let PostRAScheduler = 1;

  // FIXME: SSE4/AVX is unimplemented. This flag is set to allow
  // the scheduler to assign a default model to unrecognized opcodes.
  let CompleteModel = 0;
}

let SchedModel = BtVer2Model in {

// Jaguar can issue up to 6 micro-ops in one cycle
def JALU0 : ProcResource<1>; // Integer Pipe0: integer ALU0 (also handle FP->INT jam)
def JALU1 : ProcResource<1>; // Integer Pipe1: integer ALU1/MUL/DIV
def JLAGU : ProcResource<1>; // Integer Pipe2: LAGU
def JSAGU : ProcResource<1>; // Integer Pipe3: SAGU (also handles 3-operand LEA)
def JFPU0 : ProcResource<1>; // Vector/FPU Pipe0: VALU0/VIMUL/FPA
def JFPU1 : ProcResource<1>; // Vector/FPU Pipe1: VALU1/STC/FPM

// The Integer PRF for Jaguar is 64 entries, and it holds the architectural and
// speculative version of the 64-bit integer registers.
// Reference: www.realworldtech.com/jaguar/4/
//
// The processor always keeps the different parts of an integer register
// together. An instruction that writes to a part of a register will therefore
// have a false dependence on any previous write to the same register or any
// part of it.
// Reference: Section 21.10 "AMD Bobcat and Jaguar pipeline: Partial register
// access" - Agner Fog's "microarchitecture.pdf".
def JIntegerPRF : RegisterFile<64, [GR64, CCR], [1, 1], [1, 0],
                               0,  // Max moves that can be eliminated per cycle.
                               1>; // Restrict move elimination to zero regs.

// The Jaguar FP Retire Queue renames SIMD and FP uOps onto a pool of 72 SSE
// registers. Operations on 256-bit data types are cracked into two COPs.
// Reference: www.realworldtech.com/jaguar/4/

// The PRF in the floating point unit can eliminate a move from a MMX or SSE
// register that is known to be zero (i.e. it has been zeroed using a zero-idiom
// dependency breaking instruction, or via VZEROALL).
// Reference: Section 21.8 "AMD Bobcat and Jaguar pipeline: Dependency-breaking
// instructions" - Agner Fog's "microarchitecture.pdf"
def JFpuPRF: RegisterFile<72, [VR64, VR128, VR256], [1, 1, 2], [1, 1, 0],
                          0,  // Max moves that can be eliminated per cycle.
                          1>; // Restrict move elimination to zero regs.

// The retire control unit (RCU) can track up to 64 macro-ops in-flight. It can
// retire up to two macro-ops per cycle.
// Reference: "Software Optimization Guide for AMD Family 16h Processors"
def JRCU : RetireControlUnit<64, 2>;

// Integer Pipe Scheduler
def JALU01 : ProcResGroup<[JALU0, JALU1]> {
  let BufferSize=20;
}

// AGU Pipe Scheduler
def JLSAGU : ProcResGroup<[JLAGU, JSAGU]> {
  let BufferSize=12;
}

// Fpu Pipe Scheduler
def JFPU01 : ProcResGroup<[JFPU0, JFPU1]> {
  let BufferSize=18;
}

// Functional units
def JDiv   : ProcResource<1>; // integer division
def JMul   : ProcResource<1>; // integer multiplication
def JVALU0 : ProcResource<1>; // vector integer
def JVALU1 : ProcResource<1>; // vector integer
def JVIMUL : ProcResource<1>; // vector integer multiplication
def JSTC   : ProcResource<1>; // vector store/convert
def JFPM   : ProcResource<1>; // FP multiplication
def JFPA   : ProcResource<1>; // FP addition

// Functional unit groups
def JFPX  : ProcResGroup<[JFPA, JFPM]>;
def JVALU : ProcResGroup<[JVALU0, JVALU1]>;

// Integer loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
// cycles after the memory operand.
def : ReadAdvance<ReadAfterLd, 3>;

// Vector loads are 5 cycles, so ReadAfterVec*Ld registers needn't be available until 5
// cycles after the memory operand.
def : ReadAdvance<ReadAfterVecLd, 5>;
def : ReadAdvance<ReadAfterVecXLd, 5>;
def : ReadAdvance<ReadAfterVecYLd, 5>;

/// "Additional 6 cycle transfer operation which moves a floating point
/// operation input value from the integer unit to the floating point unit."
/// Reference: AMDfam16h SOG (Appendix A "Instruction Latencies", Section A.2).
def : ReadAdvance<ReadInt2Fpu, -6>;

// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when dispatched by the schedulers.
// This multiclass defines the resource usage for variants with and without
// folded loads.
//
// Parameters (shared by all three JWriteRes*Pair multiclasses):
//   SchedRW  - the foldable write; SchedRW.Folded names its load variant.
//   ExePorts - resources consumed by the register variant.
//   Lat      - latency of the register variant.
//   Res      - ResourceCycles for ExePorts (may be empty here).
//   UOps     - NumMicroOps of the register variant.
//   LoadUOps - extra micro-ops added to UOps for the folded-load variant.
multiclass JWriteResIntPair<X86FoldableSchedWrite SchedRW,
                            list<ProcResourceKind> ExePorts,
                            int Lat, list<int> Res = [], int UOps = 1,
                            int LoadUOps = 0> {
  // Register variant: uses ExePorts with the cycles given by Res.
  def : WriteRes<SchedRW, ExePorts> {
    let Latency = Lat;
    let ResourceCycles = Res;
    let NumMicroOps = UOps;
  }

  // Memory variant also uses a cycle on JLAGU and adds 3 cycles to the
  // latency.
  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
    let Latency = !add(Lat, 3);
    // An empty Res stays empty; otherwise one JLAGU cycle is prepended so the
    // list stays parallel to [JLAGU] + ExePorts.
    let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
    let NumMicroOps = !add(UOps, LoadUOps);
  }
}

// Same shape as JWriteResIntPair, but the folded-load variant adds 5 cycles
// of latency instead of 3.
multiclass JWriteResFpuPair<X86FoldableSchedWrite SchedRW,
                            list<ProcResourceKind> ExePorts,
                            int Lat, list<int> Res = [], int UOps = 1,
                            int LoadUOps = 0> {
  // Register variant: uses ExePorts with the cycles given by Res.
  def : WriteRes<SchedRW, ExePorts> {
    let Latency = Lat;
    let ResourceCycles = Res;
    let NumMicroOps = UOps;
  }

  // Memory variant also uses a cycle on JLAGU and adds 5 cycles to the
  // latency.
  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
    let Latency = !add(Lat, 5);
    // An empty Res stays empty; otherwise one JLAGU cycle is prepended.
    let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
    let NumMicroOps = !add(UOps, LoadUOps);
  }
}

// YMM flavour: Res defaults to [2] and UOps defaults to 2, and the folded-load
// variant always prepends two JLAGU cycles (there is no empty-Res special
// case).
multiclass JWriteResYMMPair<X86FoldableSchedWrite SchedRW,
                            list<ProcResourceKind> ExePorts,
                            int Lat, list<int> Res = [2], int UOps = 2,
                            int LoadUOps = 0> {
  // Register variant: uses ExePorts with the cycles given by Res (two cycles
  // by default).
  def : WriteRes<SchedRW, ExePorts> {
    let Latency = Lat;
    let ResourceCycles = Res;
    let NumMicroOps = UOps;
  }

  // Memory variant also uses 2 cycles on JLAGU and adds 5 cycles to the
  // latency.
  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
    let Latency = !add(Lat, 5);
    let ResourceCycles = !listconcat([2], Res);
    let NumMicroOps = !add(UOps, LoadUOps);
  }
}

// Instructions that have local forwarding disabled have an extra +1cy latency.

// A folded store needs a cycle on the SAGU for the store data, most RMW
// instructions don't need an extra uop. ALU RMW operations don't seem to
// benefit from STLF, and their observed latency is 6cy. That is the reason why
// this write adds two extra cycles (instead of just 1cy for the store).
defm : X86WriteRes<WriteRMW, [JSAGU], 2, [1], 0>;

////////////////////////////////////////////////////////////////////////////////
// Arithmetic.
////////////////////////////////////////////////////////////////////////////////

// Argument order for the JWriteResIntPair instantiations below:
//   <write, [ports], latency, [resource cycles], uops, extra-load-uops>
// NOTE(review): X86WriteRes is declared outside this file; its positional
// arguments appear to follow the same (write, ports, latency, cycles, uops)
// order - confirm against its definition.

defm : JWriteResIntPair<WriteALU, [JALU01], 1>;
defm : JWriteResIntPair<WriteADC, [JALU01], 1, [2]>;

defm : X86WriteRes<WriteBSWAP32, [JALU01], 1, [1], 1>;
defm : X86WriteRes<WriteBSWAP64, [JALU01], 1, [1], 1>;
defm : X86WriteRes<WriteCMPXCHG, [JALU01], 3, [3], 5>;
defm : X86WriteRes<WriteCMPXCHGRMW, [JALU01, JSAGU, JLAGU], 11, [3, 1, 1], 6>;
defm : X86WriteRes<WriteXCHG, [JALU01], 1, [2], 2>;

// Integer multiplies: issued on JALU1 and occupy the JMul unit.
defm : JWriteResIntPair<WriteIMul8, [JALU1, JMul], 3, [1, 1], 1>;
defm : JWriteResIntPair<WriteIMul16, [JALU1, JMul], 3, [1, 3], 3>;
defm : JWriteResIntPair<WriteIMul16Imm, [JALU1, JMul], 4, [1, 2], 2>;
defm : JWriteResIntPair<WriteIMul16Reg, [JALU1, JMul], 3, [1, 1], 1>;
defm : JWriteResIntPair<WriteIMul32, [JALU1, JMul], 3, [1, 2], 2>;
defm : JWriteResIntPair<WriteIMul32Imm, [JALU1, JMul], 3, [1, 1], 1>;
defm : JWriteResIntPair<WriteIMul32Reg, [JALU1, JMul], 3, [1, 1], 1>;
defm : JWriteResIntPair<WriteIMul64, [JALU1, JMul], 6, [1, 4], 2>;
defm : JWriteResIntPair<WriteIMul64Imm, [JALU1, JMul], 6, [1, 4], 1>;
defm : JWriteResIntPair<WriteIMul64Reg, [JALU1, JMul], 6, [1, 4], 1>;
defm : X86WriteRes<WriteIMulH, [JALU1], 6, [4], 1>;

// Integer divides: issued on JALU1; the JDiv unit is held for as many cycles
// as the latency.
defm : JWriteResIntPair<WriteDiv8, [JALU1, JDiv], 12, [1, 12], 1>;
defm : JWriteResIntPair<WriteDiv16, [JALU1, JDiv], 17, [1, 17], 2>;
defm : JWriteResIntPair<WriteDiv32, [JALU1, JDiv], 25, [1, 25], 2>;
defm : JWriteResIntPair<WriteDiv64, [JALU1, JDiv], 41, [1, 41], 2>;
defm : JWriteResIntPair<WriteIDiv8, [JALU1, JDiv], 12, [1, 12], 1>;
defm : JWriteResIntPair<WriteIDiv16, [JALU1, JDiv], 17, [1, 17], 2>;
defm : JWriteResIntPair<WriteIDiv32, [JALU1, JDiv], 25, [1, 25], 2>;
defm : JWriteResIntPair<WriteIDiv64, [JALU1, JDiv], 41, [1, 41], 2>;

defm : JWriteResIntPair<WriteCRC32, [JALU01], 3, [4], 3>;

defm : JWriteResIntPair<WriteCMOV, [JALU01], 1>; // Conditional move.
defm : X86WriteRes<WriteFCMOV, [JFPU0, JFPA], 3, [1,1], 1>; // x87 conditional move.
def : WriteRes<WriteSETCC, [JALU01]>; // Setcc.
def : WriteRes<WriteSETCCStore, [JALU01,JSAGU]>;
def : WriteRes<WriteLAHFSAHF, [JALU01]>;

defm : X86WriteRes<WriteBitTest, [JALU01], 1, [1], 1>;
defm : X86WriteRes<WriteBitTestImmLd, [JALU01,JLAGU], 4, [1,1], 1>;
defm : X86WriteRes<WriteBitTestRegLd, [JALU01,JLAGU], 4, [1,1], 5>;
defm : X86WriteRes<WriteBitTestSet, [JALU01], 1, [1], 2>;
defm : X86WriteRes<WriteBitTestSetImmLd, [JALU01,JLAGU], 4, [1,1], 4>;
defm : X86WriteRes<WriteBitTestSetRegLd, [JALU01,JLAGU], 4, [1,1], 8>;

// This is for simple LEAs with one or two input operands.
def : WriteRes<WriteLEA, [JALU01]>;

// Bit counts.
// Argument order for JWriteResIntPair:
//   <write, [ports], latency, [resource cycles], uops, extra-load-uops>
defm : JWriteResIntPair<WriteBSF, [JALU01], 4, [8], 7>;
defm : JWriteResIntPair<WriteBSR, [JALU01], 5, [8], 8>;
defm : JWriteResIntPair<WritePOPCNT, [JALU01], 1>;
defm : JWriteResIntPair<WriteLZCNT, [JALU01], 1>;
defm : JWriteResIntPair<WriteTZCNT, [JALU01], 2, [2], 2>;

// BMI1 BEXTR/BLS, BMI2 BZHI
defm : JWriteResIntPair<WriteBEXTR, [JALU01], 1>;
defm : JWriteResIntPair<WriteBLS, [JALU01], 2, [2], 2>;
defm : X86WriteResPairUnsupported<WriteBZHI>;

////////////////////////////////////////////////////////////////////////////////
// Integer shifts and rotates.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResIntPair<WriteShift, [JALU01], 1>;
defm : JWriteResIntPair<WriteShiftCL, [JALU01], 1>;
defm : JWriteResIntPair<WriteRotate, [JALU01], 1>;
defm : JWriteResIntPair<WriteRotateCL, [JALU01], 1>;

// SHLD/SHRD.
defm : X86WriteRes<WriteSHDrri, [JALU01], 3, [6], 6>;
defm : X86WriteRes<WriteSHDrrcl,[JALU01], 4, [8], 7>;
defm : X86WriteRes<WriteSHDmri, [JLAGU, JALU01], 9, [1, 22], 8>;
defm : X86WriteRes<WriteSHDmrcl,[JLAGU, JALU01], 9, [1, 22], 8>;

////////////////////////////////////////////////////////////////////////////////
// Loads, stores, and moves, not folded with other operations.
////////////////////////////////////////////////////////////////////////////////

// Plain loads go through the load AGU with a 3 cycle latency; stores go
// through the store AGU; register moves use either integer ALU.
def : WriteRes<WriteLoad, [JLAGU]> { let Latency = 3; }
def : WriteRes<WriteStore, [JSAGU]>;
def : WriteRes<WriteStoreNT, [JSAGU]>;
def : WriteRes<WriteMove, [JALU01]>;

// Load/store MXCSR.
def : WriteRes<WriteLDMXCSR, [JLAGU]> { let Latency = 3; }
def : WriteRes<WriteSTMXCSR, [JSAGU]>;

// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;

////////////////////////////////////////////////////////////////////////////////
// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely.
////////////////////////////////////////////////////////////////////////////////

// No processor resources are consumed by WriteZero.
def : WriteRes<WriteZero, []>;

////////////////////////////////////////////////////////////////////////////////
// Branches don't produce values, so they have no latency, but they still
// consume resources. Indirect branches can fold loads.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResIntPair<WriteJump, [JALU01], 1>;

////////////////////////////////////////////////////////////////////////////////
// Special case scheduling classes.
////////////////////////////////////////////////////////////////////////////////

def : WriteRes<WriteSystem, [JALU01]> { let Latency = 100; }
def : WriteRes<WriteMicrocoded, [JALU01]> { let Latency = 100; }
def : WriteRes<WriteFence, [JSAGU]>;

// Nops don't have dependencies, so there's no actual latency, but we set this
// to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
def : WriteRes<WriteNop, [JALU01]> { let Latency = 1; }

// CMPXCHG write descriptions. The LOCK-prefixed records hold both AGUs
// (JLAGU and JSAGU) for as many cycles as their latency; the non-atomic
// memory forms use a single cycle on each AGU.

def JWriteCMPXCHG8rr : SchedWriteRes<[JALU01]> {
  let Latency = 3;
  let ResourceCycles = [3];
  let NumMicroOps = 3;
}

def JWriteLOCK_CMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency = 16;
  let ResourceCycles = [3,16,16];
  let NumMicroOps = 5;
}

def JWriteLOCK_CMPXCHGrm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency = 17;
  let ResourceCycles = [3,17,17];
  let NumMicroOps = 6;
}

def JWriteCMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency = 11;
  let ResourceCycles = [3,1,1];
  let NumMicroOps = 5;
}

def JWriteCMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency = 11;
  let ResourceCycles = [3,1,1];
  let NumMicroOps = 18;
}

def JWriteCMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency = 32;
  let ResourceCycles = [6,1,1];
  let NumMicroOps = 28;
}

def JWriteLOCK_CMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency = 19;
  let ResourceCycles = [3,19,19];
  let NumMicroOps = 18;
}

def JWriteLOCK_CMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency = 38;
  let ResourceCycles = [6,38,38];
  let NumMicroOps = 28;
}

// Selects one of the CMPXCHG writes above by predicate. The atomic predicates
// are listed before the non-atomic ones, and NoSchedPred supplies the
// fallback (WriteCMPXCHG).
def JWriteCMPXCHGVariant : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap8B>, [JWriteLOCK_CMPXCHG8B]>,
  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap16B>, [JWriteLOCK_CMPXCHG16B]>,
  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap_8>, [JWriteLOCK_CMPXCHG8rm]>,
  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap>, [JWriteLOCK_CMPXCHGrm]>,
  SchedVar<MCSchedPredicate<IsCompareAndSwap8B>, [JWriteCMPXCHG8B]>,
  SchedVar<MCSchedPredicate<IsCompareAndSwap16B>, [JWriteCMPXCHG16B]>,
  SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap_8>, [JWriteCMPXCHG8rm]>,
  SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap>, [WriteCMPXCHGRMW]>,
  SchedVar<MCSchedPredicate<IsRegRegCompareAndSwap_8>, [JWriteCMPXCHG8rr]>,
  SchedVar<NoSchedPred, [WriteCMPXCHG]>
]>;

// The first five reads are contributed by the memory load operand.
// We ignore those reads and set a read-advance for the other input operands
// including the implicit read of RAX.
def : InstRW<[JWriteCMPXCHGVariant,
              ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
              ReadAfterLd, ReadAfterLd], (instrs LCMPXCHG8, LCMPXCHG16,
                                                 LCMPXCHG32, LCMPXCHG64,
                                                 CMPXCHG8rm, CMPXCHG16rm,
                                                 CMPXCHG32rm, CMPXCHG64rm)>;

def : InstRW<[JWriteCMPXCHGVariant], (instrs CMPXCHG8rr, CMPXCHG16rr,
                                             CMPXCHG32rr, CMPXCHG64rr)>;

def : InstRW<[JWriteCMPXCHGVariant,
              // Ignore reads contributed by the memory operand.
              ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
              // Add a read-advance to every implicit register read.
              ReadAfterLd, ReadAfterLd, ReadAfterLd, ReadAfterLd], (instrs LCMPXCHG8B, LCMPXCHG16B,
                                                                           CMPXCHG8B, CMPXCHG16B)>;

// Write used for LOCK-prefixed ALU read-modify-write instructions: both AGUs
// are held for the full 19 cycle latency.
def JWriteLOCK_ALURMW : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency = 19;
  let ResourceCycles = [1,19,19];
  let NumMicroOps = 1;
}

// Picks JWriteLOCK_ALURMW when the instruction carries a LOCK prefix and
// falls back to WriteALURMW otherwise.
def JWriteLOCK_ALURMWVariant : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_ALURMW]>,
  SchedVar<NoSchedPred, [WriteALURMW]>
]>;
def : InstRW<[JWriteLOCK_ALURMWVariant], (instrs INC8m, INC16m, INC32m, INC64m,
                                                 DEC8m, DEC16m, DEC32m, DEC64m,
                                                 NOT8m, NOT16m, NOT32m, NOT64m,
                                                 NEG8m, NEG16m, NEG32m, NEG64m)>;

def JWriteXCHG8rr_XADDrr : SchedWriteRes<[JALU01]> {
  let Latency = 2;
  let ResourceCycles = [3];
  let NumMicroOps = 3;
}
def : InstRW<[JWriteXCHG8rr_XADDrr], (instrs XCHG8rr, XADD8rr, XADD16rr,
                                             XADD32rr, XADD64rr)>;

// This write defines the latency of the in/out register operand of a non-atomic
// XADDrm. This is the first of a pair of writes that model non-atomic
// XADDrm instructions (the second write definition is JWriteXADDrm_LdSt_Part).
//
// We need two writes because the instruction latency differs from the output
// register operand latency. In particular, the first write describes the first
// (and only) output register operand of the instruction. However, the
// instruction latency is set to the MAX of all the write latencies. That's why
// a second write is needed in this case (see example below).
//
// Example:
//     XADD %ecx, (%rsp)      ## Instruction latency: 11cy
//                            ## ECX write Latency: 3cy
//
// Register ECX becomes available in 3 cycles. That is because the value of ECX
// is exchanged with the value read from the stack pointer, and the load-to-use
// latency is assumed to be 3cy.
def JWriteXADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
  let Latency = 3;  // load-to-use latency
  let ResourceCycles = [3];
  let NumMicroOps = 3;
}

// This write defines the latency of the in/out register operand of an atomic
// XADDrm. This is the first of a sequence of two writes used to model atomic
// XADD instructions. The second write of the sequence is JWriteXCHGrm_LdSt_Part.
//
//
// Example:
//    LOCK XADD %ecx, (%rsp)     ## Instruction Latency: 16cy
//                               ## ECX write Latency: 11cy
//
// The value of ECX becomes available only after 11cy from the start of
// execution. This write is used to specifically set that operand latency.
def JWriteLOCK_XADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
  let Latency = 11;
  let ResourceCycles = [3];
  let NumMicroOps = 3;
}

// This write defines the latency of the in/out register operand of an atomic
// XCHGrm. This write is the first of a sequence of two writes that describe
// atomic XCHG operations. We need two writes because the instruction latency
// differs from the output register write latency. We want to make sure that
// the output register operand becomes visible after 11cy. However, we want to
// set the instruction latency to 16cy.
def JWriteXCHGrm_XCHG_Part : SchedWriteRes<[JALU01]> {
  let Latency = 11;
  let ResourceCycles = [2];
  let NumMicroOps = 2;
}

// Load/store half of a non-atomic XADDrm: one cycle on each AGU, 11cy latency.
def JWriteXADDrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> {
  let Latency = 11;
  let ResourceCycles = [1, 1];
  let NumMicroOps = 1;
}

// Load/store half of XCHGrm and of LOCK XADDrm: both AGUs are held for the
// whole 16cy latency.
def JWriteXCHGrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> {
  let Latency = 16;
  let ResourceCycles = [16, 16];
  let NumMicroOps = 1;
}

// First write of the XADDrm pair (register operand latency), chosen by the
// presence of a LOCK prefix.
def JWriteXADDrm_Part1 : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_XADDrm_XCHG_Part]>,
  SchedVar<NoSchedPred, [JWriteXADDrm_XCHG_Part]>
]>;

// Second write of the XADDrm pair (load/store part), chosen by the presence
// of a LOCK prefix.
def JWriteXADDrm_Part2 : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteXCHGrm_LdSt_Part]>,
  SchedVar<NoSchedPred, [JWriteXADDrm_LdSt_Part]>
]>;

def : InstRW<[JWriteXADDrm_Part1, JWriteXADDrm_Part2, ReadAfterLd],
                 (instrs XADD8rm, XADD16rm, XADD32rm, XADD64rm,
                         LXADD8, LXADD16, LXADD32, LXADD64)>;

def : InstRW<[JWriteXCHGrm_XCHG_Part, JWriteXCHGrm_LdSt_Part, ReadAfterLd],
                 (instrs XCHG8rm, XCHG16rm, XCHG32rm, XCHG64rm)>;


////////////////////////////////////////////////////////////////////////////////
// Floating point. This covers both scalar and vector operations.
////////////////////////////////////////////////////////////////////////////////

// NOTE(review): X86WriteRes is declared outside this file; its positional
// arguments appear to be (write, [ports], latency, [resource cycles], uops) -
// confirm against its definition.

// x87 constant loads and FP/vector loads.
defm : X86WriteRes<WriteFLD0, [JFPU1, JSTC], 3, [1,1], 1>;
defm : X86WriteRes<WriteFLD1, [JFPU1, JSTC], 3, [1,1], 1>;
defm : X86WriteRes<WriteFLDC, [JFPU1, JSTC], 3, [1,1], 1>;
defm : X86WriteRes<WriteFLoad, [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFLoadX, [JLAGU], 5, [1], 1>;
defm : X86WriteRes<WriteFLoadY, [JLAGU], 5, [2], 2>;
defm : X86WriteRes<WriteFMaskedLoad, [JLAGU, JFPU01, JFPX], 6, [1, 2, 2], 1>;
defm : X86WriteRes<WriteFMaskedLoadY, [JLAGU, JFPU01, JFPX], 6, [2, 4, 4], 2>;

// FP/vector stores (regular and non-temporal).
defm : X86WriteRes<WriteFStore, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreX, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreY, [JSAGU, JFPU1, JSTC], 1, [2, 2, 2], 2>;
defm : X86WriteRes<WriteFStoreNT, [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreNTX, [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreNTY, [JSAGU, JFPU1, JSTC], 3, [2, 2, 2], 1>;

// Masked stores.
defm : X86WriteRes<WriteFMaskedStore32, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 16, [1,1, 5, 5,4,4,4], 19>;
defm : X86WriteRes<WriteFMaskedStore64, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 13, [1,1, 2, 2,2,2,2], 10>;
defm : X86WriteRes<WriteFMaskedStore32Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 22, [1,1,10,10,8,8,8], 36>;
defm : X86WriteRes<WriteFMaskedStore64Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 16, [1,1, 4, 4,4,4,4], 18>;

defm : X86WriteRes<WriteFMove, [JFPU01, JFPX], 1, [1, 1], 1>;
defm : X86WriteRes<WriteFMoveX, [JFPU01, JFPX], 1, [1, 1], 1>;
defm : X86WriteRes<WriteFMoveY, [JFPU01, JFPX], 1, [2, 2], 2>;

defm : X86WriteRes<WriteEMMS, [JFPU01, JFPX], 2, [1, 1], 1>;

// FP arithmetic, compare, multiply, divide, sqrt, rounding, logic, test and
// shuffle writes. Every *Z write class in this group is marked unsupported.
// NOTE(review): JWriteResFpuPair / JWriteResYMMPair are defined earlier in the
// file; their positional arguments look like
// <write, resources, latency[, cycles-per-resource[, micro-ops]]> - confirm
// against those definitions.
defm : JWriteResFpuPair<WriteFAdd, [JFPU0, JFPA], 3>;
defm : JWriteResFpuPair<WriteFAddX, [JFPU0, JFPA], 3>;
defm : JWriteResYMMPair<WriteFAddY, [JFPU0, JFPA], 3, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFAddZ>;
defm : JWriteResFpuPair<WriteFAdd64, [JFPU0, JFPA], 3>;
defm : JWriteResFpuPair<WriteFAdd64X, [JFPU0, JFPA], 3>;
defm : JWriteResYMMPair<WriteFAdd64Y, [JFPU0, JFPA], 3, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
defm : JWriteResFpuPair<WriteFCmp, [JFPU0, JFPA], 2>;
defm : JWriteResFpuPair<WriteFCmpX, [JFPU0, JFPA], 2>;
defm : JWriteResYMMPair<WriteFCmpY, [JFPU0, JFPA], 2, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFCmpZ>;
defm : JWriteResFpuPair<WriteFCmp64, [JFPU0, JFPA], 2>;
defm : JWriteResFpuPair<WriteFCmp64X, [JFPU0, JFPA], 2>;
defm : JWriteResYMMPair<WriteFCmp64Y, [JFPU0, JFPA], 2, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
defm : JWriteResFpuPair<WriteFCom, [JFPU0, JFPA, JALU0], 3>;
defm : JWriteResFpuPair<WriteFComX, [JFPU0, JFPA, JALU0], 3>;
defm : JWriteResFpuPair<WriteFMul, [JFPU1, JFPM], 2>;
defm : JWriteResFpuPair<WriteFMulX, [JFPU1, JFPM], 2>;
defm : JWriteResYMMPair<WriteFMulY, [JFPU1, JFPM], 2, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFMulZ>;
defm : JWriteResFpuPair<WriteFMul64, [JFPU1, JFPM], 4, [1,2]>;
defm : JWriteResFpuPair<WriteFMul64X, [JFPU1, JFPM], 4, [1,2]>;
defm : JWriteResYMMPair<WriteFMul64Y, [JFPU1, JFPM], 4, [2,4], 2>;
defm : X86WriteResPairUnsupported<WriteFMul64Z>;
defm : X86WriteResPairUnsupported<WriteFMA>;
defm : X86WriteResPairUnsupported<WriteFMAX>;
defm : X86WriteResPairUnsupported<WriteFMAY>;
defm : X86WriteResPairUnsupported<WriteFMAZ>;
defm : JWriteResFpuPair<WriteDPPD, [JFPU1, JFPM, JFPA], 9, [1, 3, 3], 3>;
defm : JWriteResFpuPair<WriteDPPS, [JFPU1, JFPM, JFPA], 11, [1, 3, 3], 5>;
defm : JWriteResYMMPair<WriteDPPSY, [JFPU1, JFPM, JFPA], 12, [2, 6, 6], 10>;
defm : X86WriteResPairUnsupported<WriteDPPSZ>;
defm : JWriteResFpuPair<WriteFRcp, [JFPU1, JFPM], 2>;
defm : JWriteResFpuPair<WriteFRcpX, [JFPU1, JFPM], 2>;
defm : JWriteResYMMPair<WriteFRcpY, [JFPU1, JFPM], 2, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFRcpZ>;
defm : JWriteResFpuPair<WriteFRsqrt, [JFPU1, JFPM], 2>;
defm : JWriteResFpuPair<WriteFRsqrtX, [JFPU1, JFPM], 2>;
defm : JWriteResYMMPair<WriteFRsqrtY, [JFPU1, JFPM], 2, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
defm : JWriteResFpuPair<WriteFDiv, [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResFpuPair<WriteFDivX, [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResYMMPair<WriteFDivY, [JFPU1, JFPM], 38, [2, 38], 2>;
defm : X86WriteResPairUnsupported<WriteFDivZ>;
defm : JWriteResFpuPair<WriteFDiv64, [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResFpuPair<WriteFDiv64X, [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResYMMPair<WriteFDiv64Y, [JFPU1, JFPM], 38, [2, 38], 2>;
defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
defm : JWriteResFpuPair<WriteFSqrt, [JFPU1, JFPM], 21, [1, 21]>;
defm : JWriteResFpuPair<WriteFSqrtX, [JFPU1, JFPM], 21, [1, 21]>;
defm : JWriteResYMMPair<WriteFSqrtY, [JFPU1, JFPM], 42, [2, 42], 2>;
defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
defm : JWriteResFpuPair<WriteFSqrt64, [JFPU1, JFPM], 27, [1, 27]>;
defm : JWriteResFpuPair<WriteFSqrt64X, [JFPU1, JFPM], 27, [1, 27]>;
defm : JWriteResYMMPair<WriteFSqrt64Y, [JFPU1, JFPM], 54, [2, 54], 2>;
defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
defm : JWriteResFpuPair<WriteFSqrt80, [JFPU1, JFPM], 35, [1, 35]>;
defm : JWriteResFpuPair<WriteFSign, [JFPU1, JFPM], 2>;
defm : JWriteResFpuPair<WriteFRnd, [JFPU1, JSTC], 3>;
defm : JWriteResYMMPair<WriteFRndY, [JFPU1, JSTC], 3, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFRndZ>;
defm : JWriteResFpuPair<WriteFLogic, [JFPU01, JFPX], 1>;
defm : JWriteResYMMPair<WriteFLogicY, [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFLogicZ>;
defm : JWriteResFpuPair<WriteFTest, [JFPU0, JFPA, JALU0], 3>;
defm : JWriteResYMMPair<WriteFTestY , [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
defm : X86WriteResPairUnsupported<WriteFTestZ>;
defm : JWriteResFpuPair<WriteFShuffle, [JFPU01, JFPX], 1>;
defm : JWriteResYMMPair<WriteFShuffleY, [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
defm : JWriteResFpuPair<WriteFVarShuffle, [JFPU01, JFPX], 3, [1, 4], 3>; // +1cy latency.
defm : JWriteResYMMPair<WriteFVarShuffleY,[JFPU01, JFPX], 4, [2, 6], 6>; // +1cy latency.
defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
defm : JWriteResFpuPair<WriteFBlend, [JFPU01, JFPX], 1>;
defm : JWriteResYMMPair<WriteFBlendY, [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFBlendZ>;
defm : JWriteResFpuPair<WriteFVarBlend, [JFPU01, JFPX], 2, [4, 4], 3>;
defm : JWriteResYMMPair<WriteFVarBlendY, [JFPU01, JFPX], 3, [6, 6], 6>;
defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
defm : JWriteResFpuPair<WriteFShuffle256, [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;

////////////////////////////////////////////////////////////////////////////////
// Conversions.
////////////////////////////////////////////////////////////////////////////////

// FP -> integer conversions.
defm : JWriteResFpuPair<WriteCvtSS2I, [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1,1,1,1,1], 2>;
defm : JWriteResFpuPair<WriteCvtPS2I, [JFPU1, JSTC], 3, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtPS2IY, [JFPU1, JSTC], 3, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
defm : JWriteResFpuPair<WriteCvtSD2I, [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1,1,1,1,1], 2>;
defm : JWriteResFpuPair<WriteCvtPD2I, [JFPU1, JSTC], 3, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtPD2IY, [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>;
defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;

// Integer -> FP conversions. The scalar forms spell out the register and
// load (Ld) variants as two separate X86WriteRes entries.
defm : X86WriteRes<WriteCvtI2SS, [JFPU1, JSTC], 4, [1,1], 2>;
defm : X86WriteRes<WriteCvtI2SSLd, [JLAGU, JFPU1, JSTC], 9, [1,1,1], 1>;
defm : JWriteResFpuPair<WriteCvtI2PS, [JFPU1, JSTC], 3, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtI2PSY, [JFPU1, JSTC], 3, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
defm : X86WriteRes<WriteCvtI2SD, [JFPU1, JSTC], 4, [1,1], 2>;
defm : X86WriteRes<WriteCvtI2SDLd, [JLAGU, JFPU1, JSTC], 9, [1,1,1], 1>;
defm : JWriteResFpuPair<WriteCvtI2PD, [JFPU1, JSTC], 3, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtI2PDY, [JFPU1, JSTC], 3, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;

// Single -> double precision.
defm : JWriteResFpuPair<WriteCvtSS2SD, [JFPU1, JSTC], 7, [1,2], 2>;
defm : JWriteResFpuPair<WriteCvtPS2PD, [JFPU1, JSTC], 2, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtPS2PDY, [JFPU1, JSTC], 2, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;

// Double -> single precision.
defm : JWriteResFpuPair<WriteCvtSD2SS, [JFPU1, JSTC], 7, [1,2], 2>;
defm : JWriteResFpuPair<WriteCvtPD2PS, [JFPU1, JSTC], 3, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtPD2PSY, [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>;
defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;

// Half -> single precision.
defm : JWriteResFpuPair<WriteCvtPH2PS, [JFPU1, JSTC], 3, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtPH2PSY, [JFPU1, JSTC], 3, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;

// Single -> half precision, register and store (St) forms.
defm : X86WriteRes<WriteCvtPS2PH, [JFPU1, JSTC], 3, [1,1], 1>;
defm : X86WriteRes<WriteCvtPS2PHY, [JFPU1, JSTC, JFPX], 6, [2,2,2], 3>;
defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
defm : X86WriteRes<WriteCvtPS2PHSt, [JFPU1, JSTC, JSAGU], 4, [1,1,1], 1>;
defm : X86WriteRes<WriteCvtPS2PHYSt, [JFPU1, JSTC, JFPX, JSAGU], 7, [2,2,2,1], 3>;
defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;

////////////////////////////////////////////////////////////////////////////////
// Vector integer operations.
////////////////////////////////////////////////////////////////////////////////

// Vector integer loads.
defm : X86WriteRes<WriteVecLoad, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecLoadX, [JLAGU], 5, [1], 1>;
defm : X86WriteRes<WriteVecLoadY, [JLAGU], 5, [2], 2>;
defm : X86WriteRes<WriteVecLoadNT, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecLoadNTY, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecMaskedLoad, [JLAGU, JFPU01, JVALU], 6, [1, 2, 2], 1>;
defm : X86WriteRes<WriteVecMaskedLoadY, [JLAGU, JFPU01, JVALU], 6, [2, 4, 4], 2>;

// Vector integer stores; the masked-store write classes are marked unsupported.
defm : X86WriteRes<WriteVecStore, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecStoreX, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecStoreY, [JSAGU, JFPU1, JSTC], 1, [2, 2, 2], 2>;
defm : X86WriteRes<WriteVecStoreNT, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecStoreNTY, [JSAGU, JFPU1, JSTC], 2, [2, 2, 2], 1>;
defm : X86WriteResUnsupported<WriteVecMaskedStore32>;
defm : X86WriteResUnsupported<WriteVecMaskedStore64>;
defm : X86WriteResUnsupported<WriteVecMaskedStore32Y>;
defm : X86WriteResUnsupported<WriteVecMaskedStore64Y>;

// Vector register moves, including transfers to and from GPRs.
defm : X86WriteRes<WriteVecMove, [JFPU01, JVALU], 1, [1, 1], 1>;
defm : X86WriteRes<WriteVecMoveX, [JFPU01, JVALU], 1, [1, 1], 1>;
defm : X86WriteRes<WriteVecMoveY, [JFPU01, JVALU], 1, [2, 2], 2>;
defm : X86WriteRes<WriteVecMoveToGpr, [JFPU0, JFPA, JALU0], 4, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecMoveFromGpr, [JFPU01, JFPX], 8, [1, 1], 2>;

// Vector integer ALU, shift and multiply writes. All *Y and *Z write classes
// in this group are marked unsupported.
defm : JWriteResFpuPair<WriteVecALU, [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecALUX, [JFPU01, JVALU], 1>;
defm : X86WriteResPairUnsupported<WriteVecALUY>;
defm : X86WriteResPairUnsupported<WriteVecALUZ>;
defm : JWriteResFpuPair<WriteVecShift, [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecShiftX, [JFPU01, JVALU], 2>; // +1cy latency.
defm : X86WriteResPairUnsupported<WriteVecShiftY>;
defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
defm : JWriteResFpuPair<WriteVecShiftImm, [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecShiftImmX,[JFPU01, JVALU], 2>; // +1cy latency.
defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
defm : X86WriteResPairUnsupported<WriteVarVecShift>;
defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
defm : JWriteResFpuPair<WriteVecIMul, [JFPU0, JVIMUL], 2>;
defm : JWriteResFpuPair<WriteVecIMulX, [JFPU0, JVIMUL], 2>;
defm : X86WriteResPairUnsupported<WriteVecIMulY>;
defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
defm : JWriteResFpuPair<WritePMULLD, [JFPU0, JFPU01, JVIMUL, JVALU], 4, [2, 1, 2, 1], 3>;
defm : X86WriteResPairUnsupported<WritePMULLDY>;
defm : X86WriteResPairUnsupported<WritePMULLDZ>;
defm : JWriteResFpuPair<WriteMPSAD, [JFPU0, JVIMUL], 3, [1, 2], 3>;
defm : X86WriteResPairUnsupported<WriteMPSADY>;
defm : X86WriteResPairUnsupported<WriteMPSADZ>;
defm : JWriteResFpuPair<WritePSADBW, [JFPU01, JVALU], 2>;
defm : JWriteResFpuPair<WritePSADBWX, [JFPU01, JVALU], 2>;
defm : X86WriteResPairUnsupported<WritePSADBWY>;
defm : X86WriteResPairUnsupported<WritePSADBWZ>;
defm : JWriteResFpuPair<WritePHMINPOS, [JFPU01, JVALU], 2>;
defm : JWriteResFpuPair<WriteShuffle, [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteShuffleX, [JFPU01, JVALU], 1>;
defm : X86WriteResPairUnsupported<WriteShuffleY>;
defm : X86WriteResPairUnsupported<WriteShuffleZ>;
defm : JWriteResFpuPair<WriteVarShuffle, [JFPU01, JVALU], 2, [1, 1], 1>;
defm : JWriteResFpuPair<WriteVarShuffleX, [JFPU01, JVALU], 2, [1, 4], 3>;
defm : X86WriteResPairUnsupported<WriteVarShuffleY>;
defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
defm : JWriteResFpuPair<WriteBlend, [JFPU01, JVALU], 1>;
defm : X86WriteResPairUnsupported<WriteBlendY>;
defm : X86WriteResPairUnsupported<WriteBlendZ>;
defm : JWriteResFpuPair<WriteVarBlend, [JFPU01, JVALU], 2, [4, 4], 3>;
defm : X86WriteResPairUnsupported<WriteVarBlendY>;
defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
defm : JWriteResFpuPair<WriteVecLogic, [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecLogicX, [JFPU01, JVALU], 1>;
defm : X86WriteResPairUnsupported<WriteVecLogicY>;
defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
defm : JWriteResFpuPair<WriteVecTest, [JFPU0, JFPA, JALU0], 3>;
defm : JWriteResYMMPair<WriteVecTestY, [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
defm : X86WriteResPairUnsupported<WriteVecTestZ>;
defm : X86WriteResPairUnsupported<WriteShuffle256>;
defm : X86WriteResPairUnsupported<WriteVPMOV256>;
defm : X86WriteResPairUnsupported<WriteVarShuffle256>;

////////////////////////////////////////////////////////////////////////////////
// Vector insert/extract operations.
////////////////////////////////////////////////////////////////////////////////

// Register and memory forms are listed as separate X86WriteRes entries
// (Ld = load form, St = store form).
defm : X86WriteRes<WriteVecInsert, [JFPU01, JVALU], 1, [1,1], 2>;
defm : X86WriteRes<WriteVecInsertLd, [JFPU01, JVALU, JLAGU], 4, [1,1,1], 1>;
defm : X86WriteRes<WriteVecExtract, [JFPU0, JFPA, JALU0], 3, [1,1,1], 1>;
defm : X86WriteRes<WriteVecExtractSt, [JFPU1, JSTC, JSAGU], 3, [1,1,1], 1>;

////////////////////////////////////////////////////////////////////////////////
// SSE42 String instructions.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResFpuPair<WritePCmpIStrI, [JFPU1, JVALU1, JFPU0, JFPA, JALU0], 7, [2, 2, 1, 1, 1], 3>;
defm : JWriteResFpuPair<WritePCmpIStrM, [JFPU1, JVALU1, JFPU0, JFPA, JALU0], 8, [2, 2, 1, 1, 1], 3>;
defm : JWriteResFpuPair<WritePCmpEStrI, [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0], 14, [1, 2, 2, 6, 4, 1, 1], 9>;
defm : JWriteResFpuPair<WritePCmpEStrM, [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0], 14, [1, 2, 2, 6, 4, 1, 1], 9>;

////////////////////////////////////////////////////////////////////////////////
// MOVMSK Instructions.
////////////////////////////////////////////////////////////////////////////////

// The FP, vector and MMX MOVMSK writes use the same resources and latency;
// the YMM vector form is marked unsupported.
def : WriteRes<WriteFMOVMSK, [JFPU0, JFPA, JALU0]> { let Latency = 3; }
def : WriteRes<WriteVecMOVMSK, [JFPU0, JFPA, JALU0]> { let Latency = 3; }
defm : X86WriteResUnsupported<WriteVecMOVMSKY>;
def : WriteRes<WriteMMXMOVMSK, [JFPU0, JFPA, JALU0]> { let Latency = 3; }

////////////////////////////////////////////////////////////////////////////////
// AES Instructions.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResFpuPair<WriteAESIMC, [JFPU0, JVIMUL], 2>;
defm : JWriteResFpuPair<WriteAESKeyGen, [JFPU0, JVIMUL], 2>;
defm : JWriteResFpuPair<WriteAESDecEnc, [JFPU01, JVALU, JFPU0, JVIMUL], 3, [1,1,1,1], 2>;

////////////////////////////////////////////////////////////////////////////////
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResFpuPair<WriteFHAdd, [JFPU0, JFPA], 4>; // +1cy latency.
defm : JWriteResYMMPair<WriteFHAddY, [JFPU0, JFPA], 4, [2,2], 2>; // +1cy latency.
defm : JWriteResFpuPair<WritePHAdd, [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WritePHAddX, [JFPU01, JVALU], 2>; // +1cy latency.
defm : X86WriteResPairUnsupported<WritePHAddY>;

////////////////////////////////////////////////////////////////////////////////
// Carry-less multiplication instructions.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResFpuPair<WriteCLMul, [JFPU0, JVIMUL], 2>;

////////////////////////////////////////////////////////////////////////////////
// SSE4A instructions.
////////////////////////////////////////////////////////////////////////////////

// Dedicated write for INSERTQ/INSERTQI.
def JWriteINSERTQ: SchedWriteRes<[JFPU01, JVALU]> {
  let Latency = 2;
  let ResourceCycles = [1, 4];
}
def : InstRW<[JWriteINSERTQ], (instrs INSERTQ, INSERTQI)>;

////////////////////////////////////////////////////////////////////////////////
// AVX instructions.
////////////////////////////////////////////////////////////////////////////////

// No fields are overridden here, so this write keeps the SchedWriteRes
// defaults for latency, resource cycles and micro-op count.
def JWriteVecExtractF128: SchedWriteRes<[JFPU01, JFPX]>;
def : InstRW<[JWriteVecExtractF128], (instrs VEXTRACTF128rr)>;

// YMM broadcasts from memory.
def JWriteVBROADCASTYLd: SchedWriteRes<[JLAGU, JFPU01, JFPX]> {
  let Latency = 6;
  let ResourceCycles = [1, 2, 4];
  let NumMicroOps = 2;
}
def : InstRW<[JWriteVBROADCASTYLd], (instrs VBROADCASTSDYrm,
                                            VBROADCASTSSYrm,
                                            VBROADCASTF128)>;

// VZEROALL / VZEROUPPER are modelled with an empty resource list: only the
// latency and micro-op count are specified.
def JWriteJVZEROALL: SchedWriteRes<[]> {
  let Latency = 90;
  let NumMicroOps = 73;
}
def : InstRW<[JWriteJVZEROALL], (instrs VZEROALL)>;

def JWriteJVZEROUPPER: SchedWriteRes<[]> {
  let Latency = 46;
  let NumMicroOps = 37;
}
def : InstRW<[JWriteJVZEROUPPER], (instrs VZEROUPPER)>;

///////////////////////////////////////////////////////////////////////////////
// SSE2/AVX Store Selected Bytes of Double Quadword - (V)MASKMOVDQ
///////////////////////////////////////////////////////////////////////////////

def JWriteMASKMOVDQU: SchedWriteRes<[JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01]> {
  let Latency = 34;
  let ResourceCycles = [1, 1, 2, 2, 2, 16, 42];
  let NumMicroOps = 63;
}
def : InstRW<[JWriteMASKMOVDQU], (instrs MASKMOVDQU, MASKMOVDQU64,
                                         VMASKMOVDQU, VMASKMOVDQU64)>;

///////////////////////////////////////////////////////////////////////////////
// SchedWriteVariant definitions.
///////////////////////////////////////////////////////////////////////////////

// Write with zero latency and no resources.
def JWriteZeroLatency : SchedWriteRes<[]> {
  let Latency = 0;
}

// Two-micro-op write on the FP pipes, with no latency override.
def JWriteZeroIdiomYmm : SchedWriteRes<[JFPU01, JFPX]> {
  let NumMicroOps = 2;
}

// Certain instructions that use the same register for both source
// operands do not have a real dependency on the previous contents of the
// register, and thus, do not have to wait before completing. They can be
// optimized out at register renaming stage.
// Reference: Section 10.8 of the "Software Optimization Guide for AMD Family
// 15h Processors".
// Reference: Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
// Section 21.8 [Dependency-breaking instructions].

// Each variant below lists a predicated SchedVar first and a NoSchedPred
// fallback second: the zero-idiom write is chosen when the predicate matches,
// otherwise the instruction keeps its regular write class.

// GPR zero idioms.
def JWriteZeroIdiom : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteALU]>
]>;
def : InstRW<[JWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
                                        XOR32rr, XOR64rr)>;

// XMM floating-point logic zero idioms.
def JWriteFZeroIdiom : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteFLogic]>
]>;
def : InstRW<[JWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr, VXORPDrr,
                                         ANDNPSrr, VANDNPSrr,
                                         ANDNPDrr, VANDNPDrr)>;

// YMM floating-point logic zero idioms; these use JWriteZeroIdiomYmm rather
// than JWriteZeroLatency.
def JWriteFZeroIdiomY : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroIdiomYmm]>,
    SchedVar<NoSchedPred,                          [WriteFLogicY]>
]>;
def : InstRW<[JWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr,
                                          VANDNPSYrr, VANDNPDYrr)>;

// MMX integer logic zero idioms.
def JWriteVZeroIdiomLogic : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteVecLogic]>
]>;
def : InstRW<[JWriteVZeroIdiomLogic], (instrs MMX_PXORirr, MMX_PANDNirr)>;

// XMM integer logic zero idioms.
def JWriteVZeroIdiomLogicX : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteVecLogicX]>
]>;
def : InstRW<[JWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr,
                                               PANDNrr, VPANDNrr)>;

// MMX integer ALU zero idioms.
def JWriteVZeroIdiomALU : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteVecALU]>
]>;
def : InstRW<[JWriteVZeroIdiomALU], (instrs MMX_PSUBBirr, MMX_PSUBDirr,
                                            MMX_PSUBQirr, MMX_PSUBWirr,
                                            MMX_PSUBSBirr, MMX_PSUBSWirr,
                                            MMX_PSUBUSBirr, MMX_PSUBUSWirr,
                                            MMX_PCMPGTBirr, MMX_PCMPGTDirr,
                                            MMX_PCMPGTWirr)>;

// XMM integer ALU zero idioms.
def JWriteVZeroIdiomALUX : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteVecALUX]>
]>;
def : InstRW<[JWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr,
                                             PSUBDrr, VPSUBDrr,
                                             PSUBQrr, VPSUBQrr,
                                             PSUBWrr, VPSUBWrr,
                                             PSUBSBrr, VPSUBSBrr,
                                             PSUBSWrr, VPSUBSWrr,
                                             PSUBUSBrr, VPSUBUSBrr,
                                             PSUBUSWrr, VPSUBUSWrr,
                                             PCMPGTBrr, VPCMPGTBrr,
                                             PCMPGTDrr, VPCMPGTDrr,
                                             PCMPGTQrr, VPCMPGTQrr,
                                             PCMPGTWrr, VPCMPGTWrr)>;

// VPERM2F128 has its own zero-idiom predicate.
def JWriteVPERM2F128 : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomVPERMPredicate>, [JWriteZeroIdiomYmm]>,
  SchedVar<NoSchedPred,                               [WriteFShuffle256]>
]>;
def : InstRW<[JWriteVPERM2F128], (instrs VPERM2F128rr)>;

// This write is used for slow LEA instructions.
def JWrite3OpsLEA : SchedWriteRes<[JALU1, JSAGU]> {
  let Latency = 2;
}

// On Jaguar, a slow LEA is either a 3Ops LEA (base, index, offset), or an LEA
// with a `Scale` value different than 1.
def JSlowLEAPredicate : MCSchedPredicate<
  CheckAny<[
    // A 3-operand LEA (base, index, offset).
    IsThreeOperandsLEAFn,
    // An LEA with a "Scale" different than 1.
    CheckAll<[
      CheckIsImmOperand<2>,
      CheckNot<CheckImmOperand<2, 1>>
    ]>
  ]>
>;

// LEA write: JWrite3OpsLEA when JSlowLEAPredicate matches, otherwise the
// default WriteLEA.
def JWriteLEA : SchedWriteVariant<[
    SchedVar<JSlowLEAPredicate, [JWrite3OpsLEA]>,
    SchedVar<NoSchedPred,       [WriteLEA]>
]>;

def : InstRW<[JWriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;

// LEA16r always uses this dedicated write; it is not predicated.
def JSlowLEA16r : SchedWriteRes<[JALU01]> {
  let Latency = 3;
  let ResourceCycles = [4];
}

def : InstRW<[JSlowLEA16r], (instrs LEA16r)>;

///////////////////////////////////////////////////////////////////////////////
// Dependency breaking instructions.
///////////////////////////////////////////////////////////////////////////////

// Opcodes treated as zero idioms, each group paired with the predicate that
// must hold for the idiom to apply.
def : IsZeroIdiomFunction<[
  // GPR Zero-idioms.
  DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>,

  // MMX Zero-idioms.
  DepBreakingClass<[
    MMX_PXORirr, MMX_PANDNirr, MMX_PSUBBirr,
    MMX_PSUBDirr, MMX_PSUBQirr, MMX_PSUBWirr,
    MMX_PSUBSBirr, MMX_PSUBSWirr, MMX_PSUBUSBirr, MMX_PSUBUSWirr,
    MMX_PCMPGTBirr, MMX_PCMPGTDirr, MMX_PCMPGTWirr
  ], ZeroIdiomPredicate>,

  // SSE Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr,

    // int variants.
    PXORrr, PANDNrr,
    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
    PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
    PCMPGTBrr, PCMPGTDrr, PCMPGTQrr, PCMPGTWrr
  ], ZeroIdiomPredicate>,

  // AVX Zero-idioms.
  DepBreakingClass<[
    // xmm fp variants.
    VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr,

    // xmm int variants.
    VPXORrr, VPANDNrr,
    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
    VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr,
    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,

    // ymm variants.
    VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr
  ], ZeroIdiomPredicate>,

  DepBreakingClass<[ VPERM2F128rr ], ZeroIdiomVPERMPredicate>
]>;

// Opcodes listed as dependency-breaking (but not as zero idioms above), each
// group paired with its predicate.
def : IsDepBreakingFunction<[
  // GPR
  DepBreakingClass<[ SBB32rr, SBB64rr ], ZeroIdiomPredicate>,
  DepBreakingClass<[ CMP32rr, CMP64rr ], CheckSameRegOperand<0, 1> >,

  // MMX
  DepBreakingClass<[
    MMX_PCMPEQBirr, MMX_PCMPEQDirr, MMX_PCMPEQWirr
  ], ZeroIdiomPredicate>,

  // SSE
  DepBreakingClass<[
    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
  ], ZeroIdiomPredicate>,

  // AVX
  DepBreakingClass<[
    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
  ], ZeroIdiomPredicate>
]>;

// Register-to-register moves declared as optimizable; the class uses TruePred,
// so no extra condition is placed on the listed opcodes.
def : IsOptimizableRegisterMove<[
  InstructionEquivalenceClass<[
    // GPR variants.
    MOV32rr, MOV64rr,

    // MMX variants.
    MMX_MOVQ64rr,

    // SSE variants.
    MOVAPSrr, MOVUPSrr,
    MOVAPDrr, MOVUPDrr,
    MOVDQArr, MOVDQUrr,

    // AVX variants.
    VMOVAPSrr, VMOVUPSrr,
    VMOVAPDrr, VMOVUPDrr,
    VMOVDQArr, VMOVDQUrr
  ], TruePred >
]>;

} // SchedModel