//=- X86ScheduleZnver3.td - X86 Znver3 Scheduling ------------*- tablegen -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the machine model for Znver3 to support instruction
// scheduling and other instruction cost heuristics.
// Based on:
//  * AMD Software Optimization Guide for AMD Family 19h Processors.
//    https://www.amd.com/system/files/TechDocs/56665.zip
//  * The microarchitecture of Intel, AMD and VIA CPUs, By Agner Fog
//    http://www.agner.org/optimize/microarchitecture.pdf
//  * AMD Zen 3 Ryzen Deep Dive Review
//    https://www.anandtech.com/show/16214/
//===----------------------------------------------------------------------===//

// Top-level model record: dispatch width, buffer sizes and the default
// latencies that the definitions below read back as Znver3Model.<field>.
def Znver3Model : SchedMachineModel {
  // AMD SOG 19h, 2.9.6 Dispatch
  // The processor may dispatch up to 6 macro ops per cycle
  // into the execution engine.
  let IssueWidth = 6;

  // AMD SOG 19h, 2.10.3
  // The retire control unit (RCU) tracks the completion status of all
  // outstanding operations (integer, load/store, and floating-point) and is
  // the final arbiter for exception processing and recovery.
  // The unit can receive up to 6 macro ops dispatched per cycle and track up
  // to 256 macro ops in-flight in non-SMT mode or 128 per thread in SMT mode.
  let MicroOpBufferSize = 256;

  // AMD SOG 19h, 2.9.1 Op Cache
  // The op cache is organized as an associative cache with 64 sets and 8 ways.
  // At each set-way intersection is an entry containing up to 8 macro ops.
  // The maximum capacity of the op cache is 4K ops.
  // Agner, 22.5 µop cache
  // The size of the µop cache is big enough for holding most critical loops.
  // FIXME: PR50584: MachineScheduler/PostRAScheduler have quadratic complexity,
  //        with large values here the compilation of certain loops
  //        ends up taking way too long.
  // let LoopMicroOpBufferSize = 4096;
  let LoopMicroOpBufferSize = 512;

  // AMD SOG 19h, 2.6.2 L1 Data Cache
  // The L1 data cache has a 4- or 5- cycle integer load-to-use latency.
  // AMD SOG 19h, 2.12 L1 Data Cache
  // The AGU and LS pipelines are optimized for simple address generation modes.
  // <...> and can achieve 4-cycle load-to-use integer load latency.
  let LoadLatency = 4;

  // AMD SOG 19h, 2.12 L1 Data Cache
  // The AGU and LS pipelines are optimized for simple address generation modes.
  // <...> and can achieve <...> 7-cycle load-to-use FP load latency.
  // Declared here as an additional field (note `int`, not `let`).
  int VecLoadLatency = 7;

  // Latency of a simple store operation.
  // Also an additional field declared on this record.
  int StoreLatency = 1;

  // FIXME: any better choice?
  let HighLatency = 25;

  // AMD SOG 19h, 2.8 Optimizing Branching
  // The branch misprediction penalty is in the range from 11 to 18 cycles,
  // <...>. The common case penalty is 13 cycles.
  let MispredictPenalty = 13;

  // Enable Post RegAlloc Scheduler pass.
  let PostRAScheduler = 1;

  let CompleteModel = 1;
}

let SchedModel = Znver3Model in {


//===----------------------------------------------------------------------===//
// RCU
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.3 Retire Control Unit
// The unit can receive up to 6 macro ops dispatched per cycle and track up to
// 256 macro ops in-flight in non-SMT mode or 128 per thread in SMT mode. <...>
// The retire unit handles in-order commit of up to eight macro ops per cycle.
// Retire control unit: sized from the model's MicroOpBufferSize, second
// argument 8.
def Zn3RCU : RetireControlUnit<Znver3Model.MicroOpBufferSize, 8>;

//===----------------------------------------------------------------------===//
// Units
//===----------------------------------------------------------------------===//

// There are a total of three units, each one with its own schedulers.

//===----------------------------------------------------------------------===//
// Integer Execution Unit
//

// AMD SOG 19h, 2.4 Superscalar Organization
// The processor uses four decoupled independent integer scheduler queues,
// each one servicing one ALU pipeline and one or two other pipelines

//
// Execution pipes
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.2 Execution Units
// The processor contains 4 general purpose integer execution pipes.
// Each pipe has an ALU capable of general purpose integer operations.
def Zn3ALU0 : ProcResource<1>;
def Zn3ALU1 : ProcResource<1>;
def Zn3ALU2 : ProcResource<1>;
def Zn3ALU3 : ProcResource<1>;

// AMD SOG 19h, 2.10.2 Execution Units
// There is also a separate branch execution unit.
def Zn3BRU1 : ProcResource<1>;

// AMD SOG 19h, 2.10.2 Execution Units
// There are three Address Generation Units (AGUs) for all load and store
// address generation. There are also 3 store data movement units
// associated with the same schedulers as the AGUs.
def Zn3AGU0 : ProcResource<1>;
def Zn3AGU1 : ProcResource<1>;
def Zn3AGU2 : ProcResource<1>;

//
// Execution Units
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.2 Execution Units
// ALU0 additionally has divide <...> execution capability.
// The divider is an alias of ALU0, not a resource of its own.
defvar Zn3Divider = Zn3ALU0;

// AMD SOG 19h, 2.10.2 Execution Units
// ALU0 additionally has <...> branch execution capability.
// Branch unit 0 is an alias of ALU0, not a resource of its own.
defvar Zn3BRU0 = Zn3ALU0;

// Integer Multiplication issued on ALU1.
defvar Zn3Multiplier = Zn3ALU1;

// Execution pipeline grouping
//===----------------------------------------------------------------------===//

// General ALU operations
def Zn3ALU0123 : ProcResGroup<[Zn3ALU0, Zn3ALU1, Zn3ALU2, Zn3ALU3]>;

// General AGU operations
def Zn3AGU012 : ProcResGroup<[Zn3AGU0, Zn3AGU1, Zn3AGU2]>;

// Control flow: jumps, calls
def Zn3BRU01 : ProcResGroup<[Zn3BRU0, Zn3BRU1]>;

// Everything that isn't control flow, but still needs to access CC register,
// namely: conditional moves, SETcc.
def Zn3ALU03 : ProcResGroup<[Zn3ALU0, Zn3ALU3]>;

// Zn3ALU1 handles complex bit twiddling: CRC/PDEP/PEXT

// Simple bit twiddling: bit test, shift/rotate, bit extraction
def Zn3ALU12 : ProcResGroup<[Zn3ALU1, Zn3ALU2]>;


//
// Scheduling
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.3 Retire Control Unit
// The integer physical register file (PRF) consists of 192 registers.
def Zn3IntegerPRF : RegisterFile<192, [GR64, CCR], [1, 1], [1, 0],
                                 6,  // Max moves that can be eliminated per cycle.
                                 0>; // Restrict move elimination to zero regs.

// anandtech, The integer scheduler has a 4*24 entry macro op capacity.
// AMD SOG 19h, 2.10.1 Schedulers
// The schedulers can receive up to six macro ops per cycle, with a limit of
// two per scheduler. Each scheduler can issue one micro op per cycle into
// each of its associated pipelines
// FIXME: these are 4 separate schedulers, not a single big one.
// Single group covering every integer pipe; its buffer is written as
// 4 * 24 entries.
def Zn3Int : ProcResGroup<[Zn3ALU0, Zn3AGU0, Zn3BRU0, // scheduler 0
                           Zn3ALU1, Zn3AGU1,          // scheduler 1
                           Zn3ALU2, Zn3AGU2,          // scheduler 2
                           Zn3ALU3,          Zn3BRU1  // scheduler 3
                          ]> {
  let BufferSize = !mul(4, 24);
}


//===----------------------------------------------------------------------===//
// Floating-Point Unit
//

// AMD SOG 19h, 2.4 Superscalar Organization
// The processor uses <...> two decoupled independent floating point schedulers
// each servicing two FP pipelines and one store or FP-to-integer pipeline.

//
// Execution pipes
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.1 Schedulers
// <...>, and six FPU pipes.
// Agner, 22.10 Floating point execution pipes
// There are six floating point/vector execution pipes.
// Pipes 0-3 are individual resources; Zn3FPP45 is one resource with two units.
def Zn3FPP0  : ProcResource<1>;
def Zn3FPP1  : ProcResource<1>;
def Zn3FPP2  : ProcResource<1>;
def Zn3FPP3  : ProcResource<1>;
def Zn3FPP45 : ProcResource<2>;

//
// Execution Units
//===----------------------------------------------------------------------===//
// AMD SOG 19h, 2.11.1 Floating Point Execution Resources

// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
defvar Zn3FPFMul0 = Zn3FPP0;
defvar Zn3FPFMul1 = Zn3FPP1;

// (v)FADD*
defvar Zn3FPFAdd0 = Zn3FPP2;
defvar Zn3FPFAdd1 = Zn3FPP3;

// All convert operations except pack/unpack
defvar Zn3FPFCvt0 = Zn3FPP2;
defvar Zn3FPFCvt1 = Zn3FPP3;

// All Divide and Square Root except Reciprocal Approximation
// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
// FDIV unit can support 2 simultaneous operations in flight
// even though it occupies a single pipe.
// FIXME: BufferSize=2 ?
// The FP divider is an alias of pipe 1.
defvar Zn3FPFDiv = Zn3FPP1;

// Moves and Logical operations on Floating Point Data Types
defvar Zn3FPFMisc0 = Zn3FPP0;
defvar Zn3FPFMisc1 = Zn3FPP1;
defvar Zn3FPFMisc2 = Zn3FPP2;
defvar Zn3FPFMisc3 = Zn3FPP3;

// Integer Adds, Subtracts, and Compares
// Some complex VADD operations are not available in all pipes.
defvar Zn3FPVAdd0 = Zn3FPP0;
defvar Zn3FPVAdd1 = Zn3FPP1;
defvar Zn3FPVAdd2 = Zn3FPP2;
defvar Zn3FPVAdd3 = Zn3FPP3;

// Integer Multiplies, SAD, Blendvb
defvar Zn3FPVMul0 = Zn3FPP0;
defvar Zn3FPVMul1 = Zn3FPP3;

// Data Shuffles, Packs, Unpacks, Permute
// Some complex shuffle operations are only available in pipe1.
defvar Zn3FPVShuf    = Zn3FPP1;
defvar Zn3FPVShufAux = Zn3FPP2;

// Bit Shift Left/Right operations
defvar Zn3FPVShift0 = Zn3FPP1;
defvar Zn3FPVShift1 = Zn3FPP2;

// Moves and Logical operations on Packed Integer Data Types
defvar Zn3FPVMisc0 = Zn3FPP0;
defvar Zn3FPVMisc1 = Zn3FPP1;
defvar Zn3FPVMisc2 = Zn3FPP2;
defvar Zn3FPVMisc3 = Zn3FPP3;

// *AES*
defvar Zn3FPAES0 = Zn3FPP0;
defvar Zn3FPAES1 = Zn3FPP1;

// *CLM*
defvar Zn3FPCLM0 = Zn3FPP0;
defvar Zn3FPCLM1 = Zn3FPP1;

// Execution pipeline grouping
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.11 Floating-Point Unit
// Stores and floating point to general purpose register transfer
// have 2 dedicated pipelines (pipe 5 and 6).
// NOTE(review): the group below only lists pipes 0-3; the quote above appears
// to describe the dedicated store/transfer pipes instead - confirm placement.
def Zn3FPU0123 : ProcResGroup<[Zn3FPP0, Zn3FPP1, Zn3FPP2, Zn3FPP3]>;

// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
def Zn3FPFMul01 : ProcResGroup<[Zn3FPFMul0, Zn3FPFMul1]>;

// (v)FADD*
// Some complex VADD operations are not available in all pipes.
// Either FP-add pipe.
def Zn3FPFAdd01 : ProcResGroup<[Zn3FPFAdd0, Zn3FPFAdd1]>;

// All convert operations except pack/unpack
def Zn3FPFCvt01 : ProcResGroup<[Zn3FPFCvt0, Zn3FPFCvt1]>;

// All Divide and Square Root except Reciprocal Approximation
// def Zn3FPFDiv : ProcResGroup<[Zn3FPFDiv]>;

// Moves and Logical operations on Floating Point Data Types
def Zn3FPFMisc0123 : ProcResGroup<[Zn3FPFMisc0, Zn3FPFMisc1,
                                   Zn3FPFMisc2, Zn3FPFMisc3]>;

def Zn3FPFMisc12 : ProcResGroup<[Zn3FPFMisc1, Zn3FPFMisc2]>;

// Loads, Stores and Move to General Register (EX) Operations
// AMD SOG 19h, 2.11 Floating-Point Unit
// Stores and floating point to general purpose register transfer
// have 2 dedicated pipelines (pipe 5 and 6).
defvar Zn3FPLd01 = Zn3FPP45;

// AMD SOG 19h, 2.11 Floating-Point Unit
// Note that FP stores are supported on two pipelines,
// but throughput is limited to one per cycle.
// Hence a single-unit resource whose Super is the two-unit Zn3FPP45.
let Super = Zn3FPP45 in
def Zn3FPSt : ProcResource<1>;

// Integer Adds, Subtracts, and Compares
// Some complex VADD operations are not available in all pipes.
def Zn3FPVAdd0123 : ProcResGroup<[Zn3FPVAdd0, Zn3FPVAdd1,
                                  Zn3FPVAdd2, Zn3FPVAdd3]>;

def Zn3FPVAdd01: ProcResGroup<[Zn3FPVAdd0, Zn3FPVAdd1]>;
def Zn3FPVAdd12: ProcResGroup<[Zn3FPVAdd1, Zn3FPVAdd2]>;

// Integer Multiplies, SAD, Blendvb
def Zn3FPVMul01 : ProcResGroup<[Zn3FPVMul0, Zn3FPVMul1]>;

// Data Shuffles, Packs, Unpacks, Permute
// Some complex shuffle operations are only available in pipe1.
// Either shuffle pipe (primary or auxiliary).
def Zn3FPVShuf01 : ProcResGroup<[Zn3FPVShuf, Zn3FPVShufAux]>;

// Bit Shift Left/Right operations
def Zn3FPVShift01 : ProcResGroup<[Zn3FPVShift0, Zn3FPVShift1]>;

// Moves and Logical operations on Packed Integer Data Types
def Zn3FPVMisc0123 : ProcResGroup<[Zn3FPVMisc0, Zn3FPVMisc1,
                                   Zn3FPVMisc2, Zn3FPVMisc3]>;

// *AES*
def Zn3FPAES01 : ProcResGroup<[Zn3FPAES0, Zn3FPAES1]>;

// *CLM*
def Zn3FPCLM01 : ProcResGroup<[Zn3FPCLM0, Zn3FPCLM1]>;


//
// Scheduling
//===----------------------------------------------------------------------===//

// Agner, 21.8 Register renaming and out-of-order schedulers
// The floating point register file has 160 vector registers
// of 128 bits each in Zen 1 and 256 bits each in Zen 2.
// anandtech also confirms this.
// NOTE(review): the quoted text speaks of Zen 1 / Zen 2; confirm that the
// same register count is intended for Zen 3.
def Zn3FpPRF : RegisterFile<160, [VR64, VR128, VR256], [1, 1, 1], [0, 1, 1],
                            6,  // Max moves that can be eliminated per cycle.
                            0>; // Restrict move elimination to zero regs.

// AMD SOG 19h, 2.11 Floating-Point Unit
// The floating-point scheduler has a 2*32 entry macro op capacity.
// AMD SOG 19h, 2.11 Floating-Point Unit
// <...> the scheduler can issue 1 micro op per cycle for each pipe.
// FIXME: those are two separate schedulers, not a single big one.
def Zn3FP : ProcResGroup<[Zn3FPP0, Zn3FPP2,          /*Zn3FPP4,*/ // scheduler 0
                          Zn3FPP1, Zn3FPP3, Zn3FPP45 /*Zn3FPP5*/  // scheduler 1
                         ]> {
  let BufferSize = !mul(2, 32);
}

// AMD SOG 19h, 2.11 Floating-Point Unit
// Macro ops can be dispatched to the 64 entry Non Scheduling Queue (NSQ)
// even if floating-point scheduler is full.
// FIXME: how to model this properly?


//===----------------------------------------------------------------------===//
// Load-Store Unit
//

// AMD SOG 19h, 2.12 Load-Store Unit
// The LS unit contains three largely independent pipe-lines
// enabling the execution of three 256-bit memory operations per cycle.
def Zn3LSU : ProcResource<3>;

// AMD SOG 19h, 2.12 Load-Store Unit
// All three memory operations can be loads.
def Zn3Load : ProcResource<3> {
  // Loads are a sub-resource of the shared LS unit.
  let Super = Zn3LSU;
  // AMD SOG 19h, 2.12 Load-Store Unit
  // The LS unit can process up to 72 out-of-order loads.
  let BufferSize = 72;
}

def Zn3LoadQueue : LoadQueue<Zn3Load>;

// AMD SOG 19h, 2.12 Load-Store Unit
// A maximum of two of the memory operations can be stores.
def Zn3Store : ProcResource<2> {
  // Stores are a sub-resource of the shared LS unit.
  let Super = Zn3LSU;
  // AMD SOG 19h, 2.12 Load-Store Unit
  // The LS unit utilizes a 64-entry store queue (STQ).
  let BufferSize = 64;
}

def Zn3StoreQueue : StoreQueue<Zn3Store>;

//===----------------------------------------------------------------------===//
// Basic helper classes.
//===----------------------------------------------------------------------===//

// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when dispatched by the schedulers.
// This multiclass defines the resource usage for variants with and without
// folded loads.

// Common base: binds SchedRW to ExePorts with the given latency (Lat),
// per-resource cycle list (Res) and micro-op count (UOps).
multiclass __zn3WriteRes<SchedWrite SchedRW, list<ProcResourceKind> ExePorts,
                         int Lat = 1, list<int> Res = [], int UOps = 1> {
  def : WriteRes<SchedRW, ExePorts> {
    let Latency = Lat;
    let ResourceCycles = Res;
    let NumMicroOps = UOps;
  }
}

// Emits two entries for a foldable write:
//  * the register form, forwarded unchanged to __zn3WriteRes;
//  * the folded-load form (SchedRW.Folded), which prepends AGU and Zn3Load to
//    ExePorts, adds LoadLat to the latency and LoadUOps to the micro-op count.
// Folded resource cycles are left empty when Res is empty and LoadRes is 1;
// otherwise they are [1, LoadRes] followed by Res, or by one cycle per
// ExePorts entry when Res is empty.
multiclass __zn3WriteResPair<X86FoldableSchedWrite SchedRW,
                             list<ProcResourceKind> ExePorts, int Lat,
                             list<int> Res, int UOps, int LoadLat, int LoadUOps,
                             ProcResourceKind AGU, int LoadRes> {
  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;

  defm : __zn3WriteRes<SchedRW.Folded,
                       !listconcat([AGU, Zn3Load], ExePorts),
                       !add(Lat, LoadLat),
                       !if(!and(!empty(Res), !eq(LoadRes, 1)),
                         [],
                         !listconcat([1, LoadRes],
                           !if(!empty(Res),
                             !listsplat(1, !size(ExePorts)),
                             Res))),
                       !add(UOps, LoadUOps)>;
}

// For classes without folded loads.
// The Int/XMM/YMM variants below all forward their arguments unchanged.
multiclass Zn3WriteResInt<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

multiclass Zn3WriteResXMM<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

multiclass Zn3WriteResYMM<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

// For classes with folded loads.
// Integer pair: the folded form uses the integer load latency and the
// Zn3AGU012 group for address generation.
multiclass Zn3WriteResIntPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           Znver3Model.LoadLatency,
                           LoadUOps, Zn3AGU012, LoadRes>;
}

// XMM pair: the folded form uses the vector load latency and Zn3FPLd01.
multiclass Zn3WriteResXMMPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           Znver3Model.VecLoadLatency,
                           LoadUOps, Zn3FPLd01, LoadRes>;
}

// YMM pair: same forwarding as the XMM pair.
multiclass Zn3WriteResYMMPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           Znver3Model.VecLoadLatency,
                           LoadUOps, Zn3FPLd01, LoadRes>;
}


//===----------------------------------------------------------------------===//
// Here be dragons.
//===----------------------------------------------------------------------===//

def : ReadAdvance<ReadAfterLd, Znver3Model.LoadLatency>;

def : ReadAdvance<ReadAfterVecLd, Znver3Model.VecLoadLatency>;
def : ReadAdvance<ReadAfterVecXLd, Znver3Model.VecLoadLatency>;
def : ReadAdvance<ReadAfterVecYLd, Znver3Model.VecLoadLatency>;

// AMD SOG 19h, 2.11 Floating-Point Unit
// There is 1 cycle of added latency for a result to cross
// from F to I or I to F domain.
def : ReadAdvance<ReadInt2Fpu, -1>;

// Instructions with both a load and a store folded are modeled as a folded
// load + WriteRMW.
defm : Zn3WriteResInt<WriteRMW, [Zn3AGU012, Zn3Store],
                      Znver3Model.StoreLatency, [1, 1], 0>;

// Loads, stores, and moves, not folded with other operations.
// Plain load: one cycle more than the model's LoadLatency.
defm : Zn3WriteResInt<WriteLoad, [Zn3AGU012, Zn3Load],
                      !add(Znver3Model.LoadLatency, 1), [1, 1], 1>;

// Model the effect of clobbering the read-write mask operand of the GATHER operation.
// Does not cost anything by itself, only has latency, matching that of the WriteLoad.
defm : Zn3WriteResInt<WriteVecMaskedGatherWriteback, [],
                      !add(Znver3Model.LoadLatency, 1), [], 0>;

def Zn3WriteMOVSlow : SchedWriteRes<[Zn3AGU012, Zn3Load]> {
  let Latency = !add(Znver3Model.LoadLatency, 1);
  let ResourceCycles = [3, 1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteMOVSlow], (instrs MOV8rm, MOV8rm_NOREX, MOV16rm,
                                        MOVSX16rm16, MOVSX16rm32, MOVZX16rm16,
                                        MOVSX16rm8, MOVZX16rm8)>;

defm : Zn3WriteResInt<WriteStore, [Zn3AGU012, Zn3Store],
                      Znver3Model.StoreLatency, [1, 2], 1>;
defm : Zn3WriteResInt<WriteStoreNT, [Zn3AGU012, Zn3Store],
                      Znver3Model.StoreLatency, [1, 2], 1>;
defm : Zn3WriteResInt<WriteMove, [Zn3ALU0123], 1, [4], 1>;

// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;

def Zn3WriteMOVBE16rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
  let Latency = Znver3Model.LoadLatency;
  let ResourceCycles = [1, 1, 4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteMOVBE16rm], (instrs MOVBE16rm)>;

def Zn3WriteMOVBEmr : SchedWriteRes<[Zn3ALU0123, Zn3AGU012, Zn3Store]> {
  let Latency = Znver3Model.StoreLatency;
  let ResourceCycles = [4, 1, 1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteMOVBEmr], (instrs MOVBE16mr, MOVBE32mr, MOVBE64mr)>;

// Arithmetic.
// Simple integer ALU op.
defm : Zn3WriteResIntPair<WriteALU, [Zn3ALU0123], 1, [1], 1>;

// Accumulator-immediate ALU forms: same latency as WriteALU but four
// resource cycles on the ALU group.
def Zn3WriteALUSlow : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 1;
  let ResourceCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteALUSlow], (instrs ADD8i8, ADD16i16, ADD32i32, ADD64i32,
                                        AND8i8, AND16i16, AND32i32, AND64i32,
                                         OR8i8,  OR16i16,  OR32i32,  OR64i32,
                                        SUB8i8, SUB16i16, SUB32i32, SUB64i32,
                                        XOR8i8, XOR16i16, XOR32i32, XOR64i32)>;

def Zn3WriteMoveExtend : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 1;
  let ResourceCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteMoveExtend], (instrs MOVSX16rr16, MOVSX16rr32,
                                           MOVZX16rr16, MOVSX16rr8,
                                           MOVZX16rr8)>;

def Zn3WriteMaterialize32bitImm: SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 1;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteMaterialize32bitImm], (instrs MOV32ri, MOV32ri_alt,
                                                    MOV64ri32)>;

// Register forms of PDEP/PEXT are bound to ALU1 only.
def Zn3WritePDEP_PEXT : SchedWriteRes<[Zn3ALU1]> {
  let Latency = 3;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WritePDEP_PEXT], (instrs PDEP32rr, PDEP64rr,
                                          PEXT32rr, PEXT64rr)>;

// Integer ALU + flags op.
defm : Zn3WriteResIntPair<WriteADC, [Zn3ALU0123], 1, [4], 1>;

def Zn3WriteADC8mr_SBB8mr : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123, Zn3Store]> {
  let Latency = 1;
  let ResourceCycles = [1, 1, 7, 1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteADC8mr_SBB8mr], (instrs ADC8mr, SBB8mr)>;

// This is for simple LEAs with one or two input operands.
// LEA instructions can't fold loads.
defm : Zn3WriteResInt<WriteLEA, [Zn3AGU012], 1, [1], 1>;

// This write is used for slow LEA instructions.
def Zn3Write3OpsLEA : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 2;
  let ResourceCycles = [1];
  let NumMicroOps = 2;
}

// On Znver3, a slow LEA is either a 3Ops LEA (base, index, offset),
// or an LEA with a `Scale` value different than 1.
// True when either sub-check holds (see the per-check comments inside).
def Zn3SlowLEAPredicate : MCSchedPredicate<
  CheckAny<[
    // A 3-operand LEA (base, index, offset).
    IsThreeOperandsLEAFn,
    // An LEA with a "Scale" different than 1.
    CheckAll<[
      CheckIsImmOperand<2>,
      CheckNot<CheckImmOperand<2, 1>>
    ]>
  ]>
>;

// Variant write: slow LEAs use Zn3Write3OpsLEA, everything else WriteLEA.
def Zn3WriteLEA : SchedWriteVariant<[
    SchedVar<Zn3SlowLEAPredicate, [Zn3Write3OpsLEA]>,
    SchedVar<NoSchedPred,         [WriteLEA]>
]>;

def : InstRW<[Zn3WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;

def Zn3SlowLEA16r : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 2; // FIXME: not from llvm-exegesis
  let ResourceCycles = [4];
  let NumMicroOps = 2;
}

def : InstRW<[Zn3SlowLEA16r], (instrs LEA16r)>;

// Integer multiplication
// Integer 8-bit multiplication.
defm : Zn3WriteResIntPair<WriteIMul8, [Zn3Multiplier], 3, [3], 1>;
// Integer 16-bit multiplication.
defm : Zn3WriteResIntPair<WriteIMul16, [Zn3Multiplier], 3, [3], 3, /*LoadUOps=*/1>;
// Integer 16-bit multiplication by immediate.
defm : Zn3WriteResIntPair<WriteIMul16Imm, [Zn3Multiplier], 4, [4], 2>;
// Integer 16-bit multiplication by register.
defm : Zn3WriteResIntPair<WriteIMul16Reg, [Zn3Multiplier], 3, [1], 1>;
// Integer 32-bit multiplication.
defm : Zn3WriteResIntPair<WriteIMul32, [Zn3Multiplier], 3, [3], 2>;

def Zn3MULX32rr : SchedWriteRes<[Zn3Multiplier]> {
  let Latency = 4;
  let ResourceCycles = [1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3MULX32rr, WriteIMulH], (instrs MULX32rr)>;

// Memory form: latency and micro-op count are derived from the register form.
def Zn3MULX32rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3Multiplier]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3MULX32rr.Latency);
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = Zn3MULX32rr.NumMicroOps;
}
def : InstRW<[Zn3MULX32rm, WriteIMulH], (instrs MULX32rm)>;

// Integer 32-bit multiplication by immediate.
defm : Zn3WriteResIntPair<WriteIMul32Imm, [Zn3Multiplier], 3, [1], 1>;
// Integer 32-bit multiplication by register.
defm : Zn3WriteResIntPair<WriteIMul32Reg, [Zn3Multiplier], 3, [1], 1>;
// Integer 64-bit multiplication.
defm : Zn3WriteResIntPair<WriteIMul64, [Zn3Multiplier], 3, [3], 2>;

def Zn3MULX64rr : SchedWriteRes<[Zn3Multiplier]> {
  let Latency = 4;
  let ResourceCycles = [1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3MULX64rr, WriteIMulH], (instrs MULX64rr)>;

// Memory form: latency and micro-op count are derived from the register form.
def Zn3MULX64rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3Multiplier]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3MULX64rr.Latency);
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = Zn3MULX64rr.NumMicroOps;
}
def : InstRW<[Zn3MULX64rm, WriteIMulH], (instrs MULX64rm)>;

// Integer 64-bit multiplication by immediate.
defm : Zn3WriteResIntPair<WriteIMul64Imm, [Zn3Multiplier], 3, [1], 1>;
// Integer 64-bit multiplication by register.
defm : Zn3WriteResIntPair<WriteIMul64Reg, [Zn3Multiplier], 3, [1], 1>;
// Integer multiplication, high part.
// Latency only: no resources and zero micro-ops.
defm : Zn3WriteResInt<WriteIMulH, [], 4, [], 0>;

// Byte Order (Endianness) 32-bit Swap.
defm : Zn3WriteResInt<WriteBSWAP32, [Zn3ALU0123], 1, [1], 1>;
// Byte Order (Endianness) 64-bit Swap.
defm : Zn3WriteResInt<WriteBSWAP64, [Zn3ALU0123], 1, [1], 1>;

// Compare and set, compare and swap.
defm : Zn3WriteResIntPair<WriteCMPXCHG, [Zn3ALU0123], 3, [12], 5>;

def Zn3WriteCMPXCHG8rr : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 3;
  let ResourceCycles = [12];
  let NumMicroOps = 3;
}
def : InstRW<[Zn3WriteCMPXCHG8rr], (instrs CMPXCHG8rr)>;

// Compare and set, compare and swap.
defm : Zn3WriteResInt<WriteCMPXCHGRMW, [Zn3ALU0123], 3, [12], 6>;

// Memory form of the 8-bit CMPXCHG: latency and micro-op count are derived
// from Zn3WriteCMPXCHG8rr.
def Zn3WriteCMPXCHG8rm_LCMPXCHG8 : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteCMPXCHG8rr.Latency);
  let ResourceCycles = [1, 1, 12];
  let NumMicroOps = !add(Zn3WriteCMPXCHG8rr.NumMicroOps, 2);
}
def : InstRW<[Zn3WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>;

def Zn3WriteCMPXCHG8B : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 3; // FIXME: not from llvm-exegesis
  let ResourceCycles = [24];
  let NumMicroOps = 19;
}
def : InstRW<[Zn3WriteCMPXCHG8B], (instrs CMPXCHG8B)>;

def Zn3WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 4; // FIXME: not from llvm-exegesis
  let ResourceCycles = [59];
  let NumMicroOps = 28;
}
def : InstRW<[Zn3WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B, LCMPXCHG16B)>;

def Zn3WriteWriteXCHGUnrenameable : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 1;
  let ResourceCycles = [2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr,
                                                      XCHG16ar)>;

def Zn3WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, 3); // FIXME: not from llvm-exegesis
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = 5;
}
def : InstRW<[Zn3WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>;

def Zn3WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, 2); // FIXME: not from llvm-exegesis
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>;

// Integer division.
// FIXME: uops for 8-bit division measures as 2. for others it's a guess.
// FIXME: latency for 8-bit division measures as 10. for others it's a guess.
// Unsigned and signed division use the same numbers per operand width,
// with the divider held for the full latency.
defm : Zn3WriteResIntPair<WriteDiv8,   [Zn3Divider], 10, [10], 2>;
defm : Zn3WriteResIntPair<WriteDiv16,  [Zn3Divider], 11, [11], 2>;
defm : Zn3WriteResIntPair<WriteDiv32,  [Zn3Divider], 13, [13], 2>;
defm : Zn3WriteResIntPair<WriteDiv64,  [Zn3Divider], 17, [17], 2>;
defm : Zn3WriteResIntPair<WriteIDiv8,  [Zn3Divider], 10, [10], 2>;
defm : Zn3WriteResIntPair<WriteIDiv16, [Zn3Divider], 11, [11], 2>;
defm : Zn3WriteResIntPair<WriteIDiv32, [Zn3Divider], 13, [13], 2>;
defm : Zn3WriteResIntPair<WriteIDiv64, [Zn3Divider], 17, [17], 2>;

// Bit scan forward.
defm : Zn3WriteResIntPair<WriteBSF, [Zn3ALU1], 3, [3], 6, /*LoadUOps=*/2>;
// Bit scan reverse.
defm : Zn3WriteResIntPair<WriteBSR, [Zn3ALU1], 4, [4], 6, /*LoadUOps=*/2>;

// Bit population count.
defm : Zn3WriteResIntPair<WritePOPCNT, [Zn3ALU0123], 1, [1], 1>;

def Zn3WritePOPCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 1;
  let ResourceCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WritePOPCNT16rr], (instrs POPCNT16rr)>;

// Leading zero count.
defm : Zn3WriteResIntPair<WriteLZCNT, [Zn3ALU0123], 1, [1], 1>;

def Zn3WriteLZCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 1;
  let ResourceCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteLZCNT16rr], (instrs LZCNT16rr)>;

// Trailing zero count.
defm : Zn3WriteResIntPair<WriteTZCNT, [Zn3ALU12], 2, [1], 2>;

def Zn3WriteTZCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 2;
  let ResourceCycles = [4];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteTZCNT16rr], (instrs TZCNT16rr)>;

// Conditional move.
defm : Zn3WriteResIntPair<WriteCMOV, [Zn3ALU03], 1, [1], 1>;
// X87 conditional move.
// FIXME: not from llvm-exegesis
defm : Zn3WriteResInt<WriteFCMOV, [Zn3ALU0123], 7, [28], 7>;
// Set register based on condition code.
defm : Zn3WriteResInt<WriteSETCC, [Zn3ALU03], 1, [2], 1>;
// FIXME: latency not from llvm-exegesis
defm : Zn3WriteResInt<WriteSETCCStore, [Zn3ALU03, Zn3AGU012, Zn3Store], 2, [2, 1, 1], 2>;
// Load/Store flags in AH.
defm : Zn3WriteResInt<WriteLAHFSAHF, [Zn3ALU3], 1, [1], 1>;

// Bit Test
defm : Zn3WriteResInt<WriteBitTest, [Zn3ALU12], 1, [1], 1>;
defm : Zn3WriteResInt<WriteBitTestImmLd, [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 1), [1, 1, 1], 2>;
defm : Zn3WriteResInt<WriteBitTestRegLd, [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 1), [1, 1, 1], 7>;

// Bit Test + Set
defm : Zn3WriteResInt<WriteBitTestSet, [Zn3ALU12], 2, [2], 2>;
defm : Zn3WriteResInt<WriteBitTestSetImmLd, [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 2), [1, 1, 1], 4>;
defm : Zn3WriteResInt<WriteBitTestSetRegLd, [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 2), [1, 1, 1], 9>;

// Integer shifts and rotates.
defm : Zn3WriteResIntPair<WriteShift,   [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn3WriteResIntPair<WriteShiftCL, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn3WriteResIntPair<WriteRotate,  [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;

// Rotate-through-carry by one, register forms.
def Zn3WriteRotateR1 : SchedWriteRes<[Zn3ALU12]> {
  let Latency = 1;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteRotateR1], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
                                         RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;

// Rotate-through-carry by one, memory forms; latency and micro-op count are
// derived from Zn3WriteRotateR1.
def Zn3WriteRotateM1 : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateR1.Latency);
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn3WriteRotateR1.NumMicroOps, 1);
}
def : InstRW<[Zn3WriteRotateM1], (instrs RCL8m1, RCL16m1, RCL32m1, RCL64m1,
                                         RCR8m1, RCR16m1, RCR32m1, RCR64m1)>;

// RCR by immediate, register forms.
def Zn3WriteRotateRightRI : SchedWriteRes<[Zn3ALU12]> {
  let Latency = 3;
  let ResourceCycles = [6];
  let NumMicroOps = 7;
}
def : InstRW<[Zn3WriteRotateRightRI], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;

// RCR by immediate, memory forms.
def Zn3WriteRotateRightMI : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateRightRI.Latency);
  let ResourceCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn3WriteRotateRightRI.NumMicroOps, 3);
}
def : InstRW<[Zn3WriteRotateRightMI], (instrs RCR8mi, RCR16mi, RCR32mi, RCR64mi)>;

// RCL by immediate, register forms.
def Zn3WriteRotateLeftRI : SchedWriteRes<[Zn3ALU12]> {
  let Latency = 4;
  let ResourceCycles = [8];
  let NumMicroOps = 9;
}
def : InstRW<[Zn3WriteRotateLeftRI], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;

// RCL by immediate, memory forms.
def Zn3WriteRotateLeftMI : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateLeftRI.Latency);
  let ResourceCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn3WriteRotateLeftRI.NumMicroOps, 2);
}
def : InstRW<[Zn3WriteRotateLeftMI], (instrs RCL8mi, RCL16mi, RCL32mi, RCL64mi)>;

defm : Zn3WriteResIntPair<WriteRotateCL, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;

// RCR by CL, register forms.
def Zn3WriteRotateRightRCL : SchedWriteRes<[Zn3ALU12]> {
  let Latency = 3;
  let ResourceCycles = [6];
  let NumMicroOps = 7;
}
def : InstRW<[Zn3WriteRotateRightRCL], (instrs RCR8rCL, RCR16rCL, RCR32rCL, RCR64rCL)>;

// RCR by CL, memory forms.
def Zn3WriteRotateRightMCL : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateRightRCL.Latency);
  let ResourceCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn3WriteRotateRightRCL.NumMicroOps, 2);
}
def : InstRW<[Zn3WriteRotateRightMCL], (instrs RCR8mCL, RCR16mCL, RCR32mCL, RCR64mCL)>;

// RCL by CL, register forms.
def Zn3WriteRotateLeftRCL : SchedWriteRes<[Zn3ALU12]> {
  let Latency = 4;
  let ResourceCycles = [8];
  let NumMicroOps = 9;
}
def : InstRW<[Zn3WriteRotateLeftRCL], (instrs RCL8rCL, RCL16rCL, RCL32rCL, RCL64rCL)>;

// RCL by CL, memory forms.
def Zn3WriteRotateLeftMCL : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateLeftRCL.Latency);
  let ResourceCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn3WriteRotateLeftRCL.NumMicroOps, 2);
}
def : InstRW<[Zn3WriteRotateLeftMCL], (instrs RCL8mCL, RCL16mCL, RCL32mCL, RCL64mCL)>;

// Double shift instructions.
defm : Zn3WriteResInt<WriteSHDrri, [Zn3ALU12], 2, [3], 4>;
defm : Zn3WriteResInt<WriteSHDrrcl, [Zn3ALU12], 2, [3], 5>;
defm : Zn3WriteResInt<WriteSHDmri, [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 2), [1, 1, 4], 6>;
defm : Zn3WriteResInt<WriteSHDmrcl, [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 2), [1, 1, 4], 6>;

// BMI1 BEXTR/BLS, BMI2 BZHI
defm : Zn3WriteResIntPair<WriteBEXTR, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn3WriteResIntPair<WriteBLS, [Zn3ALU0123], 2, [2], 2, /*LoadUOps=*/1>;
defm : Zn3WriteResIntPair<WriteBZHI, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;

// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely.
// Modeled with zero latency and zero resource cycles, but one micro-op.
defm : Zn3WriteResInt<WriteZero, [Zn3ALU0123], 0, [0], 1>;

// Branches don't produce values, so they have no latency, but they still
// consume resources. Indirect branches can fold loads.
// NOTE(review): the entry below nevertheless uses a latency of 1 - confirm
// whether that is intentional.
// FIXME: not from llvm-exegesis
defm : Zn3WriteResIntPair<WriteJump, [Zn3BRU01], 1, [1], 1>;

// Floating point. This covers both scalar and vector operations.
// WriteFLD0/WriteFLD1/WriteFLDC: latency is the model's LoadLatency plus a
// per-write constant.
defm : Zn3WriteResInt<WriteFLD0, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 4), [1, 1, 1], 1>;
defm : Zn3WriteResInt<WriteFLD1, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
defm : Zn3WriteResInt<WriteFLDC, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
// FP loads (plain and masked): one micro-op on Zn3FPLd01 + Zn3Load with a
// latency of VecLoadLatency + 1.
defm : Zn3WriteResXMM<WriteFLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteFLoadX, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteFLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteFMaskedLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteFMaskedLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
// FP stores: one micro-op on Zn3FPSt + Zn3Store at the model's StoreLatency.
defm : Zn3WriteResXMM<WriteFStore, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;

// Per-instruction override for the MOVHPD/MOVHPS store forms bound below:
// two micro-ops instead of one.
// NOTE(review): the record name says "MMX" but the instructions bound to it
// are MOVHPD/MOVHPS stores; the name looks stale. Confirm before renaming.
def Zn3WriteWriteFStoreMMX : SchedWriteRes<[Zn3FPSt, Zn3Store]> {
  let Latency = 2; // FIXME: not from llvm-exegesis
  let ResourceCycles = [1, 1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteWriteFStoreMMX], (instrs MOVHPDmr, MOVHPSmr,
                                               VMOVHPDmr, VMOVHPSmr)>;

defm : Zn3WriteResXMM<WriteFStoreX, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResYMM<WriteFStoreY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResXMM<WriteFStoreNT, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResXMM<WriteFStoreNTX, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResYMM<WriteFStoreNTY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;

// Masked FP stores: many micro-ops, with Zn3FPSt held for several cycles.
defm : Zn3WriteResXMM<WriteFMaskedStore32, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;
defm : Zn3WriteResXMM<WriteFMaskedStore64, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [4, 1], 10>;
defm : Zn3WriteResYMM<WriteFMaskedStore32Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [12, 1], 42>;
defm : Zn3WriteResYMM<WriteFMaskedStore64Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;

defm : Zn3WriteResXMMPair<WriteFAdd, [Zn3FPFAdd01], 3, [1], 1>; // Floating point add/sub.

// Override for the *_FI16m / *_FI32m add, sub and mul forms bound below:
// Zn3FPU0123 is held for 24 cycles.
def Zn3WriteX87Arith : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
  let ResourceCycles = [1, 1, 24];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteX87Arith], (instrs ADD_FI16m, ADD_FI32m,
                                         SUB_FI16m, SUB_FI32m,
                                         SUBR_FI16m, SUBR_FI32m,
                                         MUL_FI16m, MUL_FI32m)>;

// Override for the *_FI16m / *_FI32m divide forms bound below: same shape as
// Zn3WriteX87Arith, but Zn3FPU0123 is held for 62 cycles.
def Zn3WriteX87Div : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
  let ResourceCycles = [1, 1, 62];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteX87Div], (instrs DIV_FI16m, DIV_FI32m,
                                       DIVR_FI16m, DIVR_FI32m)>;

defm : Zn3WriteResXMMPair<WriteFAddX, [Zn3FPFAdd01], 3, [1], 1>; // Floating point add/sub (XMM).
defm : Zn3WriteResYMMPair<WriteFAddY, [Zn3FPFAdd01], 3, [1], 1>; // Floating point add/sub (YMM).
defm : X86WriteResPairUnsupported<WriteFAddZ>; // Floating point add/sub (ZMM).
defm : Zn3WriteResXMMPair<WriteFAdd64, [Zn3FPFAdd01], 3, [1], 1>; // Floating point double add/sub.
defm : Zn3WriteResXMMPair<WriteFAdd64X, [Zn3FPFAdd01], 3, [1], 1>; // Floating point double add/sub (XMM).
defm : Zn3WriteResYMMPair<WriteFAdd64Y, [Zn3FPFAdd01], 3, [1], 1>; // Floating point double add/sub (YMM).
defm : X86WriteResPairUnsupported<WriteFAdd64Z>; // Floating point double add/sub (ZMM).
defm : Zn3WriteResXMMPair<WriteFCmp, [Zn3FPFMul01], 1, [1], 1>; // Floating point compare.
defm : Zn3WriteResXMMPair<WriteFCmpX, [Zn3FPFMul01], 1, [1], 1>; // Floating point compare (XMM).
// NOTE(review): the Zn3WriteRes* multiclasses are defined outside this
// section; their positional arguments appear to be
// <SchedWrite, resources, latency, resource cycles, micro-ops[, LoadUOps]>.
// Confirm against their definitions.
defm : Zn3WriteResYMMPair<WriteFCmpY, [Zn3FPFMul01], 1, [1], 1>; // Floating point compare (YMM).
defm : X86WriteResPairUnsupported<WriteFCmpZ>; // Floating point compare (ZMM).
defm : Zn3WriteResXMMPair<WriteFCmp64, [Zn3FPFMul01], 1, [1], 1>; // Floating point double compare.
defm : Zn3WriteResXMMPair<WriteFCmp64X, [Zn3FPFMul01], 1, [1], 1>; // Floating point double compare (XMM).
defm : Zn3WriteResYMMPair<WriteFCmp64Y, [Zn3FPFMul01], 1, [1], 1>; // Floating point double compare (YMM).
defm : X86WriteResPairUnsupported<WriteFCmp64Z>; // Floating point double compare (ZMM).
defm : Zn3WriteResXMMPair<WriteFCom, [Zn3FPFMul01], 3, [2], 1>; // FIXME: latency not from llvm-exegesis // Floating point compare to flags (X87).
defm : Zn3WriteResXMMPair<WriteFComX, [Zn3FPFMul01], 4, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point compare to flags (SSE).
defm : Zn3WriteResXMMPair<WriteFMul, [Zn3FPFMul01], 3, [1], 1>; // Floating point multiplication.
defm : Zn3WriteResXMMPair<WriteFMulX, [Zn3FPFMul01], 3, [1], 1>; // Floating point multiplication (XMM).
defm : Zn3WriteResYMMPair<WriteFMulY, [Zn3FPFMul01], 3, [1], 1>; // Floating point multiplication (YMM).
defm : X86WriteResPairUnsupported<WriteFMulZ>; // Floating point multiplication (ZMM).
defm : Zn3WriteResXMMPair<WriteFMul64, [Zn3FPFMul01], 3, [1], 1>; // Floating point double multiplication.
defm : Zn3WriteResXMMPair<WriteFMul64X, [Zn3FPFMul01], 3, [1], 1>; // Floating point double multiplication (XMM).
defm : Zn3WriteResYMMPair<WriteFMul64Y, [Zn3FPFMul01], 3, [1], 1>; // Floating point double multiplication (YMM).
defm : X86WriteResPairUnsupported<WriteFMul64Z>; // Floating point double multiplication (ZMM).
// Division writes are bound to the Zn3FPFDiv resource and hold it for
// several cycles.
defm : Zn3WriteResXMMPair<WriteFDiv, [Zn3FPFDiv], 11, [3], 1>; // Floating point division.
defm : Zn3WriteResXMMPair<WriteFDivX, [Zn3FPFDiv], 11, [3], 1>; // Floating point division (XMM).
// Division and square-root writes all use the Zn3FPFDiv resource; the
// double-precision variants carry higher latency and hold it for more
// cycles. ZMM variants are marked unsupported.
defm : Zn3WriteResYMMPair<WriteFDivY, [Zn3FPFDiv], 11, [3], 1>; // Floating point division (YMM).
defm : X86WriteResPairUnsupported<WriteFDivZ>; // Floating point division (ZMM).
defm : Zn3WriteResXMMPair<WriteFDiv64, [Zn3FPFDiv], 13, [5], 1>; // Floating point double division.
defm : Zn3WriteResXMMPair<WriteFDiv64X, [Zn3FPFDiv], 13, [5], 1>; // Floating point double division (XMM).
defm : Zn3WriteResYMMPair<WriteFDiv64Y, [Zn3FPFDiv], 13, [5], 1>; // Floating point double division (YMM).
defm : X86WriteResPairUnsupported<WriteFDiv64Z>; // Floating point double division (ZMM).
defm : Zn3WriteResXMMPair<WriteFSqrt, [Zn3FPFDiv], 15, [5], 1>; // Floating point square root.
defm : Zn3WriteResXMMPair<WriteFSqrtX, [Zn3FPFDiv], 15, [5], 1>; // Floating point square root (XMM).
defm : Zn3WriteResYMMPair<WriteFSqrtY, [Zn3FPFDiv], 15, [5], 1>; // Floating point square root (YMM).
defm : X86WriteResPairUnsupported<WriteFSqrtZ>; // Floating point square root (ZMM).
defm : Zn3WriteResXMMPair<WriteFSqrt64, [Zn3FPFDiv], 21, [9], 1>; // Floating point double square root.
defm : Zn3WriteResXMMPair<WriteFSqrt64X, [Zn3FPFDiv], 21, [9], 1>; // Floating point double square root (XMM).
defm : Zn3WriteResYMMPair<WriteFSqrt64Y, [Zn3FPFDiv], 21, [9], 1>; // Floating point double square root (YMM).
defm : X86WriteResPairUnsupported<WriteFSqrt64Z>; // Floating point double square root (ZMM).
defm : Zn3WriteResXMMPair<WriteFSqrt80, [Zn3FPFDiv], 22, [23], 1>; // FIXME: latency not from llvm-exegesis // Floating point long double square root.
// Reciprocal estimates use Zn3FPFMul01 rather than Zn3FPFDiv.
defm : Zn3WriteResXMMPair<WriteFRcp, [Zn3FPFMul01], 3, [1], 1>; // Floating point reciprocal estimate.
defm : Zn3WriteResXMMPair<WriteFRcpX, [Zn3FPFMul01], 3, [1], 1>; // Floating point reciprocal estimate (XMM).
defm : Zn3WriteResYMMPair<WriteFRcpY, [Zn3FPFMul01], 3, [1], 1>; // Floating point reciprocal estimate (YMM).
defm : X86WriteResPairUnsupported<WriteFRcpZ>; // Floating point reciprocal estimate (ZMM).
// Reciprocal square root estimates are bound to Zn3FPFDiv for one cycle.
defm : Zn3WriteResXMMPair<WriteFRsqrt, [Zn3FPFDiv], 3, [1], 1>; // Floating point reciprocal square root estimate.
defm : Zn3WriteResXMMPair<WriteFRsqrtX, [Zn3FPFDiv], 3, [1], 1>; // Floating point reciprocal square root estimate (XMM).
defm : Zn3WriteResYMMPair<WriteFRsqrtY, [Zn3FPFDiv], 3, [1], 1>; // Floating point reciprocal square root estimate (YMM).
defm : X86WriteResPairUnsupported<WriteFRsqrtZ>; // Floating point reciprocal square root estimate (ZMM).
defm : Zn3WriteResXMMPair<WriteFMA, [Zn3FPFMul01], 4, [2], 1>; // Fused Multiply Add.
defm : Zn3WriteResXMMPair<WriteFMAX, [Zn3FPFMul01], 4, [2], 1>; // Fused Multiply Add (XMM).
defm : Zn3WriteResYMMPair<WriteFMAY, [Zn3FPFMul01], 4, [2], 1>; // Fused Multiply Add (YMM).
defm : X86WriteResPairUnsupported<WriteFMAZ>; // Fused Multiply Add (ZMM).
// Dot products are multi-micro-op; the explicit LoadUOps argument sets the
// micro-op count used for the load-folding variant.
defm : Zn3WriteResXMMPair<WriteDPPD, [Zn3FPFMul01], 9, [6], 3, /*LoadUOps=*/2>; // Floating point double dot product.
defm : Zn3WriteResXMMPair<WriteDPPS, [Zn3FPFMul01], 15, [8], 8, /*LoadUOps=*/2>; // Floating point single dot product.
defm : Zn3WriteResYMMPair<WriteDPPSY, [Zn3FPFMul01], 15, [8], 7, /*LoadUOps=*/1>; // Floating point single dot product (YMM).
defm : X86WriteResPairUnsupported<WriteDPPSZ>; // Floating point single dot product (ZMM).
defm : Zn3WriteResXMMPair<WriteFSign, [Zn3FPFMul01], 1, [2], 1>; // FIXME: latency not from llvm-exegesis // Floating point fabs/fchs.
defm : Zn3WriteResXMMPair<WriteFRnd, [Zn3FPFCvt01], 3, [1], 1>; // Floating point rounding.
defm : Zn3WriteResYMMPair<WriteFRndY, [Zn3FPFCvt01], 3, [1], 1>; // Floating point rounding (YMM).
defm : X86WriteResPairUnsupported<WriteFRndZ>; // Floating point rounding (ZMM).
defm : Zn3WriteResXMMPair<WriteFLogic, [Zn3FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals.
// FP logic, test, shuffle and blend writes. Each family has an XMM and a YMM
// entry; the ZMM entry is marked unsupported.
defm : Zn3WriteResYMMPair<WriteFLogicY, [Zn3FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals (YMM).
defm : X86WriteResPairUnsupported<WriteFLogicZ>; // Floating point and/or/xor logicals (ZMM).
defm : Zn3WriteResXMMPair<WriteFTest, [Zn3FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions.
defm : Zn3WriteResYMMPair<WriteFTestY, [Zn3FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (YMM).
defm : X86WriteResPairUnsupported<WriteFTestZ>; // Floating point TEST instructions (ZMM).
defm : Zn3WriteResXMMPair<WriteFShuffle, [Zn3FPVShuf01], 1, [1], 1>; // Floating point vector shuffles.
defm : Zn3WriteResYMMPair<WriteFShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Floating point vector shuffles (YMM).
defm : X86WriteResPairUnsupported<WriteFShuffleZ>; // Floating point vector shuffles (ZMM).
// Variable shuffles use the same resource as fixed shuffles but with a
// latency of 3 instead of 1.
defm : Zn3WriteResXMMPair<WriteFVarShuffle, [Zn3FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles.
defm : Zn3WriteResYMMPair<WriteFVarShuffleY, [Zn3FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles (YMM).
defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>; // Floating point vector variable shuffles (ZMM).
defm : Zn3WriteResXMMPair<WriteFBlend, [Zn3FPFMul01], 1, [1], 1>; // Floating point vector blends.
defm : Zn3WriteResYMMPair<WriteFBlendY, [Zn3FPFMul01], 1, [1], 1>; // Floating point vector blends (YMM).
defm : X86WriteResPairUnsupported<WriteFBlendZ>; // Floating point vector blends (ZMM).
defm : Zn3WriteResXMMPair<WriteFVarBlend, [Zn3FPFMul01], 1, [1], 1>; // Fp vector variable blends.
defm : Zn3WriteResYMMPair<WriteFVarBlendY, [Zn3FPFMul01], 1, [1], 1>; // Fp vector variable blends (YMM).
defm : X86WriteResPairUnsupported<WriteFVarBlendZ>; // Fp vector variable blends (ZMM).

// Horizontal Add/Sub (float and integer)
defm : Zn3WriteResXMMPair<WriteFHAdd, [Zn3FPFAdd0], 6, [2], 4>;
defm : Zn3WriteResYMMPair<WriteFHAddY, [Zn3FPFAdd0], 6, [2], 3, /*LoadUOps=*/1>;
defm : X86WriteResPairUnsupported<WriteFHAddZ>;
defm : Zn3WriteResXMMPair<WritePHAdd, [Zn3FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
defm : Zn3WriteResXMMPair<WritePHAddX, [Zn3FPVAdd0], 2, [2], 4>;
defm : Zn3WriteResYMMPair<WritePHAddY, [Zn3FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
defm : X86WriteResPairUnsupported<WritePHAddZ>;

// Vector integer operations.
// Vector loads: one micro-op on Zn3FPLd01 + Zn3Load with a latency of
// VecLoadLatency + 1.
defm : Zn3WriteResXMM<WriteVecLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecLoadX, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteVecLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecLoadNT, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteVecLoadNTY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecMaskedLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteVecMaskedLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
// Vector stores: one micro-op on Zn3FPSt + Zn3Store at StoreLatency.
defm : Zn3WriteResXMM<WriteVecStore, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecStoreX, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;

// Register forms of VEXTRACTF128/VEXTRACTI128. The two records that follow
// derive their latency and micro-op counts from this one.
def Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn3FPFMisc0]> {
  let Latency = 4;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr], (instrs VEXTRACTF128rr, VEXTRACTI128rr)>;

// The 'mr' forms of VEXTRACTF128/VEXTRACTI128: one extra micro-op on top of
// the register form, using the FP store resources.
// NOTE(review): this record uses the store resources (Zn3FPSt, Zn3Store), yet
// its latency is built from LoadLatency rather than StoreLatency. Confirm this
// is intentional.
def Zn3WriteVEXTRACTI128mr : SchedWriteRes<[Zn3FPFMisc0, Zn3FPSt, Zn3Store]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
  let ResourceCycles = [1, 1, 1];
  let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1);
}
def : InstRW<[Zn3WriteVEXTRACTI128mr], (instrs VEXTRACTI128mr, VEXTRACTF128mr)>;

// VINSERTF128rm: reuses the VEXTRACT register-form latency and micro-op count
// as its baseline, adding LoadLatency and no extra micro-op.
def Zn3WriteVINSERTF128rmr : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPFMisc0]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
  let ResourceCycles = [1, 1, 1];
  let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0);
}
def : InstRW<[Zn3WriteVINSERTF128rmr], (instrs VINSERTF128rm)>;

defm : Zn3WriteResYMM<WriteVecStoreY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecStoreNT, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResYMM<WriteVecStoreNTY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
// Masked vector stores use the same numbers as the masked FP stores
// (WriteFMaskedStore*) would for the same element width.
defm : Zn3WriteResXMM<WriteVecMaskedStore32, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;
defm : Zn3WriteResXMM<WriteVecMaskedStore64, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [4, 1], 10>;
defm : Zn3WriteResYMM<WriteVecMaskedStore32Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [12, 1], 42>;
defm : Zn3WriteResYMM<WriteVecMaskedStore64Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;

// Moves between vector and general-purpose registers.
defm : Zn3WriteResXMM<WriteVecMoveToGpr, [Zn3FPLd01], 1, [2], 1>;
defm : Zn3WriteResXMM<WriteVecMoveFromGpr, [Zn3FPLd01], 1, [2], 1>;

// Override for MMX_MOVQ2FR64rr / MMX_MOVQ2DQrr.
def Zn3WriteMOVMMX : SchedWriteRes<[Zn3FPLd01, Zn3FPFMisc0123]> {
  let Latency = 1;
  let ResourceCycles = [1, 2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteMOVMMX], (instrs MMX_MOVQ2FR64rr, MMX_MOVQ2DQrr)>;

// Override for MMX_MOVD64rr / MMX_MOVD64to64rr: like Zn3WriteMOVMMX but holds
// Zn3FPFMisc0123 for 4 cycles instead of 2.
def Zn3WriteMOVMMXSlow : SchedWriteRes<[Zn3FPLd01, Zn3FPFMisc0123]> {
  let Latency = 1;
  let ResourceCycles = [1, 4];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteMOVMMXSlow], (instrs MMX_MOVD64rr, MMX_MOVD64to64rr)>;

defm : Zn3WriteResXMMPair<WriteVecALU, [Zn3FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals.

// Override for EXTRQ / INSERTQ (single micro-op).
def Zn3WriteEXTRQ_INSERTQ : SchedWriteRes<[Zn3FPVShuf01, Zn3FPLd01]> {
  let Latency = 3;
  let ResourceCycles = [1, 1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteEXTRQ_INSERTQ], (instrs EXTRQ, INSERTQ)>;

// Override for EXTRQI / INSERTQI (two micro-ops).
def Zn3WriteEXTRQI_INSERTQI : SchedWriteRes<[Zn3FPVShuf01, Zn3FPLd01]> {
  let Latency = 3;
  let ResourceCycles = [1, 1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteEXTRQI_INSERTQI], (instrs EXTRQI, INSERTQI)>;

defm : Zn3WriteResXMMPair<WriteVecALUX, [Zn3FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (XMM).

// Override for the XMM ALU instructions listed below: same latency and
// micro-op count as WriteVecALUX, but bound to the narrower Zn3FPVAdd01
// resource instead of Zn3FPVAdd0123.
def Zn3WriteVecALUXSlow : SchedWriteRes<[Zn3FPVAdd01]> {
  let Latency = 1;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteVecALUXSlow], (instrs PABSBrr, PABSDrr, PABSWrr,
                                            PADDSBrr, PADDSWrr, PADDUSBrr, PADDUSWrr,
                                            PAVGBrr, PAVGWrr,
                                            PSIGNBrr, PSIGNDrr, PSIGNWrr,
                                            VPABSBrr, VPABSDrr, VPABSWrr,
                                            VPADDSBrr, VPADDSWrr, VPADDUSBrr, VPADDUSWrr,
                                            VPAVGBrr, VPAVGWrr,
                                            VPCMPEQQrr,
                                            VPSIGNBrr, VPSIGNDrr, VPSIGNWrr,
                                            PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr, VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr)>;

// MMX counterpart of Zn3WriteVecALUXSlow, with identical field values.
def Zn3WriteVecALUXMMX : SchedWriteRes<[Zn3FPVAdd01]> {
  let Latency = 1;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteVecALUXMMX], (instrs MMX_PABSBrr, MMX_PABSDrr, MMX_PABSWrr,
                                           MMX_PSIGNBrr, MMX_PSIGNDrr, MMX_PSIGNWrr,
                                           MMX_PADDSBirr, MMX_PADDSWirr, MMX_PADDUSBirr, MMX_PADDUSWirr,
                                           MMX_PAVGBirr, MMX_PAVGWirr,
                                           MMX_PSUBSBirr, MMX_PSUBSWirr, MMX_PSUBUSBirr, MMX_PSUBUSWirr)>;

defm : Zn3WriteResYMMPair<WriteVecALUY, [Zn3FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (YMM).

// Override for the YMM ALU instructions listed below: one micro-op, latency 1,
// bound to Zn3FPVAdd01.
def Zn3WriteVecALUYSlow : SchedWriteRes<[Zn3FPVAdd01]> {
  let Latency = 1;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteVecALUYSlow], (instrs VPABSBYrr, VPABSDYrr, VPABSWYrr,
                                            VPADDSBYrr, VPADDSWYrr, VPADDUSBYrr, VPADDUSWYrr,
                                            VPSUBSBYrr, VPSUBSWYrr, VPSUBUSBYrr, VPSUBUSWYrr,
                                            VPAVGBYrr, VPAVGWYrr,
                                            VPCMPEQQYrr,
                                            VPSIGNBYrr, VPSIGNDYrr, VPSIGNWYrr)>;

defm : X86WriteResPairUnsupported<WriteVecALUZ>; // Vector integer ALU op, no logicals (ZMM).
defm : Zn3WriteResXMMPair<WriteVecLogic, [Zn3FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals.
defm : Zn3WriteResXMMPair<WriteVecLogicX, [Zn3FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (XMM).
defm : Zn3WriteResYMMPair<WriteVecLogicY, [Zn3FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (YMM).
defm : X86WriteResPairUnsupported<WriteVecLogicZ>; // Vector integer and/or/xor logicals (ZMM).
// Vector TEST writes use two resources (Zn3FPVAdd12 and Zn3FPSt) and two
// micro-ops.
defm : Zn3WriteResXMMPair<WriteVecTest, [Zn3FPVAdd12, Zn3FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions.
defm : Zn3WriteResYMMPair<WriteVecTestY, [Zn3FPVAdd12, Zn3FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions (YMM).
defm : X86WriteResPairUnsupported<WriteVecTestZ>; // Vector integer TEST instructions (ZMM).
defm : Zn3WriteResXMMPair<WriteVecShift, [Zn3FPVShift01], 1, [1], 1>; // Vector integer shifts (default).
defm : Zn3WriteResXMMPair<WriteVecShiftX, [Zn3FPVShift01], 1, [1], 1>; // Vector integer shifts (XMM).
defm : Zn3WriteResYMMPair<WriteVecShiftY, [Zn3FPVShift01], 1, [1], 1>; // Vector integer shifts (YMM).
defm : X86WriteResPairUnsupported<WriteVecShiftZ>; // Vector integer shifts (ZMM).
defm : Zn3WriteResXMMPair<WriteVecShiftImm, [Zn3FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (default).
// Vector integer shift-immediate, multiply, shuffle and blend writes. ZMM
// variants are marked unsupported.
defm : Zn3WriteResXMMPair<WriteVecShiftImmX, [Zn3FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (XMM).
defm : Zn3WriteResYMMPair<WriteVecShiftImmY, [Zn3FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (YMM).
defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>; // Vector integer immediate shifts (ZMM).
defm : Zn3WriteResXMMPair<WriteVecIMul, [Zn3FPVMul01], 3, [1], 1>; // Vector integer multiply (default).
defm : Zn3WriteResXMMPair<WriteVecIMulX, [Zn3FPVMul01], 3, [1], 1>; // Vector integer multiply (XMM).
defm : Zn3WriteResYMMPair<WriteVecIMulY, [Zn3FPVMul01], 3, [1], 1>; // Vector integer multiply (YMM).
defm : X86WriteResPairUnsupported<WriteVecIMulZ>; // Vector integer multiply (ZMM).
// PMULLD is modelled with the same numbers as the generic vector multiply.
defm : Zn3WriteResXMMPair<WritePMULLD, [Zn3FPVMul01], 3, [1], 1>; // Vector PMULLD.
defm : Zn3WriteResYMMPair<WritePMULLDY, [Zn3FPVMul01], 3, [1], 1>; // Vector PMULLD (YMM).
defm : X86WriteResPairUnsupported<WritePMULLDZ>; // Vector PMULLD (ZMM).
defm : Zn3WriteResXMMPair<WriteShuffle, [Zn3FPVShuf01], 1, [1], 1>; // Vector shuffles.
defm : Zn3WriteResXMMPair<WriteShuffleX, [Zn3FPVShuf01], 1, [1], 1>; // Vector shuffles (XMM).
defm : Zn3WriteResYMMPair<WriteShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Vector shuffles (YMM).
defm : X86WriteResPairUnsupported<WriteShuffleZ>; // Vector shuffles (ZMM).
defm : Zn3WriteResXMMPair<WriteVarShuffle, [Zn3FPVShuf01], 1, [1], 1>; // Vector variable shuffles.
defm : Zn3WriteResXMMPair<WriteVarShuffleX, [Zn3FPVShuf01], 1, [1], 1>; // Vector variable shuffles (XMM).
defm : Zn3WriteResYMMPair<WriteVarShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Vector variable shuffles (YMM).
defm : X86WriteResPairUnsupported<WriteVarShuffleZ>; // Vector variable shuffles (ZMM).
defm : Zn3WriteResXMMPair<WriteBlend, [Zn3FPVMisc0123], 1, [1], 1>; // Vector blends.
defm : Zn3WriteResYMMPair<WriteBlendY, [Zn3FPVMisc0123], 1, [1], 1>; // Vector blends (YMM).
defm : X86WriteResPairUnsupported<WriteBlendZ>; // Vector blends (ZMM).
// Variable blends are bound to Zn3FPVMul01, unlike fixed blends, which use
// Zn3FPVMisc0123.
defm : Zn3WriteResXMMPair<WriteVarBlend, [Zn3FPVMul01], 1, [1], 1>; // Vector variable blends.
defm : Zn3WriteResYMMPair<WriteVarBlendY, [Zn3FPVMul01], 1, [1], 1>; // Vector variable blends (YMM).
defm : X86WriteResPairUnsupported<WriteVarBlendZ>; // Vector variable blends (ZMM).
defm : Zn3WriteResXMMPair<WritePSADBW, [Zn3FPVAdd0123], 3, [2], 1>; // Vector PSADBW.
defm : Zn3WriteResXMMPair<WritePSADBWX, [Zn3FPVAdd0123], 3, [2], 1>; // Vector PSADBW (XMM).
defm : Zn3WriteResYMMPair<WritePSADBWY, [Zn3FPVAdd0123], 3, [2], 1>; // Vector PSADBW (YMM).
defm : X86WriteResPairUnsupported<WritePSADBWZ>; // Vector PSADBW (ZMM).
defm : Zn3WriteResXMMPair<WriteMPSAD, [Zn3FPVAdd0123], 4, [8], 4, /*LoadUOps=*/2>; // Vector MPSAD.
defm : Zn3WriteResYMMPair<WriteMPSADY, [Zn3FPVAdd0123], 4, [8], 3, /*LoadUOps=*/1>; // Vector MPSAD (YMM).
defm : X86WriteResPairUnsupported<WriteMPSADZ>; // Vector MPSAD (ZMM).
defm : Zn3WriteResXMMPair<WritePHMINPOS, [Zn3FPVAdd01], 3, [1], 1>; // Vector PHMINPOS.

// Vector insert/extract operations.
// NOTE(review): WriteVecInsert passes LoadUOps=-1; the multiclass that
// interprets this value is outside this section. Confirm its meaning there.
defm : Zn3WriteResXMMPair<WriteVecInsert, [Zn3FPLd01], 1, [2], 2, /*LoadUOps=*/-1>; // Insert gpr to vector element.
defm : Zn3WriteResXMM<WriteVecExtract, [Zn3FPLd01], 1, [2], 2>; // Extract vector element to gpr.
defm : Zn3WriteResXMM<WriteVecExtractSt, [Zn3FPSt, Zn3Store], !add(1, Znver3Model.StoreLatency), [1, 1], 2>; // Extract vector element and store.

// MOVMSK operations.
// All four MOVMSK writes are a single micro-op on Zn3FPVMisc2.
defm : Zn3WriteResXMM<WriteFMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;
defm : Zn3WriteResXMM<WriteVecMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;
defm : Zn3WriteResYMM<WriteVecMOVMSKY, [Zn3FPVMisc2], 1, [1], 1>;
defm : Zn3WriteResXMM<WriteMMXMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;

// Conversion between integer and float.
defm : Zn3WriteResXMMPair<WriteCvtSD2I, [Zn3FPFCvt01], 2, [2], 2>; // Double -> Integer.
// Conversions in this group are all bound to the Zn3FPFCvt01 resource.
defm : Zn3WriteResXMMPair<WriteCvtPD2I, [Zn3FPFCvt01], 3, [1], 1>; // Double -> Integer (XMM).
defm : Zn3WriteResYMMPair<WriteCvtPD2IY, [Zn3FPFCvt01], 6, [2], 2>; // Double -> Integer (YMM).
defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>; // Double -> Integer (ZMM).

// Override for the MMX CVT(T)PD2PI forms bound below.
// NOTE(review): the 'irm' forms share this record with the 'irr' forms, and
// the record lists no AGU/Load resources. Presumably 'irm' is the memory
// form; confirm whether a separate load-folding record is needed.
def Zn3WriteCvtPD2IMMX : SchedWriteRes<[Zn3FPFCvt01]> {
  let Latency = 1;
  let ResourceCycles = [2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteCvtPD2IMMX], (instrs MMX_CVTPD2PIirm, MMX_CVTTPD2PIirm, MMX_CVTPD2PIirr, MMX_CVTTPD2PIirr)>;

defm : Zn3WriteResXMMPair<WriteCvtSS2I, [Zn3FPFCvt01], 2, [2], 2>; // Float -> Integer.

defm : Zn3WriteResXMMPair<WriteCvtPS2I, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Integer (XMM).
defm : Zn3WriteResYMMPair<WriteCvtPS2IY, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Integer (YMM).
defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>; // Float -> Integer (ZMM).

defm : Zn3WriteResXMMPair<WriteCvtI2SD, [Zn3FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double.
defm : Zn3WriteResXMMPair<WriteCvtI2PD, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Double (XMM).
defm : Zn3WriteResYMMPair<WriteCvtI2PDY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double (YMM).
defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>; // Integer -> Double (ZMM).

// Override for MMX_CVTPI2PD: holds Zn3FPFCvt01 for 6 cycles.
// NOTE(review): as with Zn3WriteCvtPD2IMMX, the 'irm' form is bound here
// without any load resources. Confirm.
def Zn3WriteCvtI2PDMMX : SchedWriteRes<[Zn3FPFCvt01]> {
  let Latency = 2;
  let ResourceCycles = [6];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteCvtI2PDMMX], (instrs MMX_CVTPI2PDirm, MMX_CVTPI2PDirr)>;

defm : Zn3WriteResXMMPair<WriteCvtI2SS, [Zn3FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Float.
defm : Zn3WriteResXMMPair<WriteCvtI2PS, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Float (XMM).
defm : Zn3WriteResYMMPair<WriteCvtI2PSY, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Float (YMM).
defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>; // Integer -> Float (ZMM).

// Override for MMX_CVTPI2PSirr: two micro-ops, latency 3, on Zn3FPFCvt01.
def Zn3WriteCvtI2PSMMX : SchedWriteRes<[Zn3FPFCvt01]> {
  let Latency = 3;
  let ResourceCycles = [1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteCvtI2PSMMX], (instrs MMX_CVTPI2PSirr)>;

defm : Zn3WriteResXMMPair<WriteCvtSS2SD, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Double size conversion.
defm : Zn3WriteResXMMPair<WriteCvtPS2PD, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Double size conversion (XMM).
defm : Zn3WriteResYMMPair<WriteCvtPS2PDY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Float -> Double size conversion (YMM).
defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>; // Float -> Double size conversion (ZMM).

defm : Zn3WriteResXMMPair<WriteCvtSD2SS, [Zn3FPFCvt01], 3, [1], 1>; // Double -> Float size conversion.
defm : Zn3WriteResXMMPair<WriteCvtPD2PS, [Zn3FPFCvt01], 3, [1], 1>; // Double -> Float size conversion (XMM).
defm : Zn3WriteResYMMPair<WriteCvtPD2PSY, [Zn3FPFCvt01], 6, [2], 2>; // Double -> Float size conversion (YMM).
defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>; // Double -> Float size conversion (ZMM).

defm : Zn3WriteResXMMPair<WriteCvtPH2PS, [Zn3FPFCvt01], 3, [1], 1>; // Half -> Float size conversion.
defm : Zn3WriteResYMMPair<WriteCvtPH2PSY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Half -> Float size conversion (YMM).
defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>; // Half -> Float size conversion (ZMM).

// Float -> Half writes are declared with the non-Pair multiclasses; the
// store-folding variants are given as separate *St writes whose latency is
// the register-form latency plus StoreLatency.
defm : Zn3WriteResXMM<WriteCvtPS2PH, [Zn3FPFCvt01], 3, [2], 1>; // Float -> Half size conversion.
defm : Zn3WriteResYMM<WriteCvtPS2PHY, [Zn3FPFCvt01], 6, [2], 2>; // Float -> Half size conversion (YMM).
defm : X86WriteResUnsupported<WriteCvtPS2PHZ>; // Float -> Half size conversion (ZMM).
defm : Zn3WriteResXMM<WriteCvtPS2PHSt, [Zn3FPFCvt01, Zn3FPSt, Zn3Store], !add(3, Znver3Model.StoreLatency), [1, 1, 1], 2>; // Float -> Half + store size conversion.
// Float -> Half + store size conversion (YMM).
defm : Zn3WriteResYMM<WriteCvtPS2PHYSt, [Zn3FPFCvt01, Zn3FPSt, Zn3Store],
                      !add(6, Znver3Model.StoreLatency), [2, 1, 1], 3>;
// Float -> Half + store size conversion (ZMM).
defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;

// CRC32 instruction.
defm : Zn3WriteResIntPair<WriteCRC32, [Zn3ALU1], 3, [1], 1>;

// SHA instructions. Each register-form record is followed by its load-folding
// counterpart, which adds the AGU and Load resources and derives its latency
// and micro-op count from the register form.
def Zn3WriteSHA1MSG1rr : SchedWriteRes<[Zn3FPU0123]> {
  let NumMicroOps = 2;
  let ResourceCycles = [2];
  let Latency = 2;
}
def : InstRW<[Zn3WriteSHA1MSG1rr],
             (instrs SHA1MSG1rr)>;

// No extra micro-op is added for the load.
def Zn3WriteSHA1MSG1rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let NumMicroOps = !add(Zn3WriteSHA1MSG1rr.NumMicroOps, 0);
  let ResourceCycles = [1, 1, 2];
  let Latency = !add(Zn3WriteSHA1MSG1rr.Latency, Znver3Model.LoadLatency);
}
def : InstRW<[Zn3WriteSHA1MSG1rm],
             (instrs SHA1MSG1rm)>;

// SHA1MSG2 and SHA1NEXTE share one record.
def Zn3WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn3FPU0123]> {
  let NumMicroOps = 1;
  let ResourceCycles = [2];
  let Latency = 1;
}
def : InstRW<[Zn3WriteSHA1MSG2rr_SHA1NEXTErr],
             (instrs SHA1MSG2rr, SHA1NEXTErr)>;

def Zn3Writerm_SHA1MSG2rm_SHA1NEXTErm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let NumMicroOps = !add(Zn3WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps, 0);
  let ResourceCycles = [1, 1, 2];
  let Latency = !add(Zn3WriteSHA1MSG2rr_SHA1NEXTErr.Latency, Znver3Model.LoadLatency);
}
def : InstRW<[Zn3Writerm_SHA1MSG2rm_SHA1NEXTErm],
             (instrs SHA1MSG2rm, SHA1NEXTErm)>;

def Zn3WriteSHA256MSG1rr : SchedWriteRes<[Zn3FPU0123]> {
  let NumMicroOps = 2;
  let ResourceCycles = [3];
  let Latency = 2;
}
def : InstRW<[Zn3WriteSHA256MSG1rr],
             (instrs SHA256MSG1rr)>;

def Zn3Writerm_SHA256MSG1rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let NumMicroOps = !add(Zn3WriteSHA256MSG1rr.NumMicroOps, 0);
  let ResourceCycles = [1, 1, 3];
  let Latency = !add(Zn3WriteSHA256MSG1rr.Latency, Znver3Model.LoadLatency);
}
def : InstRW<[Zn3Writerm_SHA256MSG1rm],
             (instrs SHA256MSG1rm)>;

def Zn3WriteSHA256MSG2rr : SchedWriteRes<[Zn3FPU0123]> {
  let NumMicroOps = 4;
  let ResourceCycles = [8];
  let Latency = 3;
}
def : InstRW<[Zn3WriteSHA256MSG2rr],
             (instrs SHA256MSG2rr)>;

// Unlike the other SHA load-folding records, this one adds one micro-op.
def Zn3WriteSHA256MSG2rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let NumMicroOps = !add(Zn3WriteSHA256MSG2rr.NumMicroOps, 1);
  let ResourceCycles = [1, 1, 8];
  let Latency = !add(Zn3WriteSHA256MSG2rr.Latency, Znver3Model.LoadLatency);
}
def : InstRW<[Zn3WriteSHA256MSG2rm],
             (instrs SHA256MSG2rm)>;

// Round instructions: a single micro-op that holds Zn3FPU0123 for 8 cycles.
// Only the register forms are bound here.
def Zn3WriteSHA1RNDS4rri : SchedWriteRes<[Zn3FPU0123]> {
  let NumMicroOps = 1;
  let ResourceCycles = [8];
  let Latency = 6;
}
def : InstRW<[Zn3WriteSHA1RNDS4rri],
             (instrs SHA1RNDS4rri)>;

def Zn3WriteSHA256RNDS2rr : SchedWriteRes<[Zn3FPU0123]> {
  let NumMicroOps = 1;
  let ResourceCycles = [8];
  let Latency = 4;
}
def : InstRW<[Zn3WriteSHA256RNDS2rr],
             (instrs SHA256RNDS2rr)>;

// String instructions.
// Packed Compare Implicit Length Strings, Return Mask
defm : Zn3WriteResXMMPair<WritePCmpIStrM, [Zn3FPVAdd0123],
                          6, [8], 3, /*LoadUOps=*/1>;
// Packed Compare Explicit Length Strings, Return Mask
defm : Zn3WriteResXMMPair<WritePCmpEStrM, [Zn3FPVAdd0123],
                          6, [12], 7, /*LoadUOps=*/5>;
// Packed Compare Implicit Length Strings, Return Index
defm : Zn3WriteResXMMPair<WritePCmpIStrI, [Zn3FPVAdd0123],
                          2, [8], 4>;
// Packed Compare Explicit Length Strings, Return Index
defm : Zn3WriteResXMMPair<WritePCmpEStrI, [Zn3FPVAdd0123],
                          6, [12], 8, /*LoadUOps=*/4>;

// AES instructions.
// Decryption, encryption.
defm : Zn3WriteResXMMPair<WriteAESDecEnc, [Zn3FPAES01], 4, [1], 1>;
// InvMixColumn.
defm : Zn3WriteResXMMPair<WriteAESIMC, [Zn3FPAES01], 4, [1], 1>;
// AES key generation.
defm : Zn3WriteResXMMPair<WriteAESKeyGen, [Zn3FPAES01], 4, [1], 1>;

// Carry-less multiplication instructions.
defm : Zn3WriteResXMMPair<WriteCLMul, [Zn3FPCLM01], 4, [4], 4>;

// EMMS/FEMMS
// FIXME: latency not from llvm-exegesis
defm : Zn3WriteResInt<WriteEMMS, [Zn3ALU0123], 2, [1], 1>;

// Load/store MXCSR
// FIXME: latency not from llvm-exegesis
defm : Zn3WriteResInt<WriteLDMXCSR,
                      [Zn3AGU012, Zn3Load, Zn3ALU0123],
                      !add(Znver3Model.LoadLatency, 1),
                      [1, 1, 6], 1>;
// FIXME: latency not from llvm-exegesis
defm : Zn3WriteResInt<WriteSTMXCSR,
                      [Zn3ALU0123, Zn3AGU012, Zn3Store],
                      !add(1, Znver3Model.StoreLatency),
                      [60, 1, 1], 2>;

// Catch-all for expensive system instructions.
defm : Zn3WriteResInt<WriteSystem, [Zn3ALU0123], 100, [100], 100>;

// VZEROUPPER is modelled with zero latency.
def Zn3WriteVZEROUPPER : SchedWriteRes<[Zn3FPU0123]> {
  let NumMicroOps = 1;
  let ResourceCycles = [1];
  let Latency = 0; // FIXME: not from llvm-exegesis
}
def : InstRW<[Zn3WriteVZEROUPPER], (instrs VZEROUPPER)>;

// VZEROALL is modelled as 18 micro-ops.
def Zn3WriteVZEROALL : SchedWriteRes<[Zn3FPU0123]> {
  let NumMicroOps = 18;
  let ResourceCycles = [24];
  let Latency = 10; // FIXME: not from llvm-exegesis
}
def : InstRW<[Zn3WriteVZEROALL], (instrs VZEROALL)>;

// AVX2.
// Fp 256-bit width vector shuffles.
defm : Zn3WriteResYMMPair<WriteFShuffle256, [Zn3FPVShuf],
                          2, [1], 1, /*LoadUOps=*/2>;
// Fp 256-bit width variable shuffles.
defm : Zn3WriteResYMMPair<WriteFVarShuffle256, [Zn3FPVShuf],
                          7, [1], 2, /*LoadUOps=*/1>;
// 256-bit width vector shuffles.
defm : Zn3WriteResYMMPair<WriteShuffle256, [Zn3FPVShuf],
                          2, [1], 1>;

// 256-bit permutes.
//
// Register forms use the Zn3FPVShuf resource. Memory forms add an AGU and a
// load resource and derive latency / micro-op count from a register form:
// Znver3Model.LoadLatency is added to the latency, and an explicit delta
// (written out even when it is 0) to the micro-op count.

// VPERM2I128 / VPERM2F128, register source.
def Zn3WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn3FPVShuf]> {
  let NumMicroOps = 1;
  let ResourceCycles = [1];
  let Latency = 3;
}
def : InstRW<[Zn3WriteVPERM2I128rr_VPERM2F128rr],
             (instrs VPERM2I128rr,
                     VPERM2F128rr)>;

// VPERM2F128, memory source.
def Zn3WriteVPERM2F128rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
  let NumMicroOps = !add(Zn3WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0);
  let ResourceCycles = [1, 1, 1];
  let Latency = !add(Znver3Model.LoadLatency,
                     Zn3WriteVPERM2I128rr_VPERM2F128rr.Latency);
}
def : InstRW<[Zn3WriteVPERM2F128rm], (instrs VPERM2F128rm)>;

// VPERMPS (YMM), register source.
def Zn3WriteVPERMPSYrr : SchedWriteRes<[Zn3FPVShuf]> {
  let NumMicroOps = 2;
  let ResourceCycles = [1];
  let Latency = 7;
}
def : InstRW<[Zn3WriteVPERMPSYrr], (instrs VPERMPSYrr)>;

// VPERMPS (YMM), memory source.
def Zn3WriteVPERMPSYrm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
  let NumMicroOps = !add(Zn3WriteVPERMPSYrr.NumMicroOps, 1);
  let ResourceCycles = [1, 1, 2];
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERMPSYrr.Latency);
}
def : InstRW<[Zn3WriteVPERMPSYrm], (instrs VPERMPSYrm)>;

// VPERMPD / VPERMQ (YMM), register source with immediate.
def Zn3WriteVPERMYri : SchedWriteRes<[Zn3FPVShuf]> {
  let NumMicroOps = 2;
  let ResourceCycles = [1];
  let Latency = 6;
}
def : InstRW<[Zn3WriteVPERMYri],
             (instrs VPERMPDYri,
                     VPERMQYri)>;

// VPERMPD (YMM), memory source with immediate; derived from Zn3WriteVPERMYri.
def Zn3WriteVPERMPDYmi : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
  let NumMicroOps = !add(Zn3WriteVPERMYri.NumMicroOps, 1);
  let ResourceCycles = [1, 1, 2];
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERMYri.Latency);
}
def : InstRW<[Zn3WriteVPERMPDYmi], (instrs VPERMPDYmi)>;

// VPERMD (YMM), register source.
def Zn3WriteVPERMDYrr : SchedWriteRes<[Zn3FPVShuf]> {
  let NumMicroOps = 2;
  let ResourceCycles = [1];
  let Latency = 5;
}
def : InstRW<[Zn3WriteVPERMDYrr], (instrs VPERMDYrr)>;

// VPERMQ / VPERMD (YMM), memory source; derived from Zn3WriteVPERMDYrr.
def Zn3WriteVPERMYm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
  let NumMicroOps = !add(Zn3WriteVPERMDYrr.NumMicroOps, 0);
  let ResourceCycles = [1, 1, 2];
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERMDYrr.Latency);
}
def : InstRW<[Zn3WriteVPERMYm],
             (instrs VPERMQYmi,
                     VPERMDYrm)>;

// 256-bit width packed vector width-changing move.
defm : Zn3WriteResYMMPair<WriteVPMOV256, [Zn3FPVShuf01],
                          4, [3], 2, /*LoadUOps=*/-1>;
// 256-bit width vector variable shuffles.
defm : Zn3WriteResYMMPair<WriteVarShuffle256, [Zn3FPVShuf01],
                          1, [1], 2>;
// Variable vector shifts.
defm : Zn3WriteResXMMPair<WriteVarVecShift, [Zn3FPVShift01],
                          1, [1], 1>;
// Variable vector shifts (YMM).
defm : Zn3WriteResYMMPair<WriteVarVecShiftY, [Zn3FPVShift01],
                          1, [1], 1>;
// Variable vector shifts (ZMM): marked as unsupported in this model.
defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;

// Old microcoded instructions that nobody uses.
defm : Zn3WriteResInt<WriteMicrocoded, [Zn3ALU0123], 100, [100], 100>;

// Fence instructions.
defm : Zn3WriteResInt<WriteFence, [Zn3ALU0123], 1, [100], 1>;

// LFENCE occupies Zn3LSU for 30 cycles in this model.
def Zn3WriteLFENCE : SchedWriteRes<[Zn3LSU]> {
  let NumMicroOps = 1;
  let ResourceCycles = [30];
  let Latency = 1;
}
def : InstRW<[Zn3WriteLFENCE], (instrs LFENCE)>;

// SFENCE occupies Zn3LSU for a single cycle in this model.
def Zn3WriteSFENCE : SchedWriteRes<[Zn3LSU]> {
  let NumMicroOps = 1;
  let ResourceCycles = [1];
  let Latency = 1;
}
def : InstRW<[Zn3WriteSFENCE], (instrs SFENCE)>;

// Nop, not very useful except it provides a model for nops!
// FIXME: latency not from llvm-exegesis
defm : Zn3WriteResInt<WriteNop, [Zn3ALU0123], 0, [1], 1>;


///////////////////////////////////////////////////////////////////////////////
// Zero Cycle Move
///////////////////////////////////////////////////////////////////////////////

// A write that has zero latency, consumes no execution resources and counts
// as one micro-op.
def Zn3WriteZeroLatency : SchedWriteRes<[]> {
  let NumMicroOps = 1;
  let ResourceCycles = [];
  let Latency = 0;
}
def : InstRW<[Zn3WriteZeroLatency],
             (instrs MOV32rr, MOV32rr_REV, MOV64rr, MOV64rr_REV,
                     MOVSX32rr32)>;

// Same as above, but counted as two micro-ops; used for register swaps.
def Zn3WriteSwapRenameable : SchedWriteRes<[]> {
  let NumMicroOps = 2;
  let ResourceCycles = [];
  let Latency = 0;
}
def : InstRW<[Zn3WriteSwapRenameable],
             (instrs XCHG32rr, XCHG32ar, XCHG64rr, XCHG64ar)>;

// Compare+Exchange - TODO RMW support.
defm : Zn3WriteResInt<WriteXCHG, [Zn3ALU0123], 0, [8], 2>;

// Floating-point register moves. The X/Y variants are empty sched classes:
// no resources and zero latency.
defm : Zn3WriteResXMM<WriteFMove, [Zn3FPVMisc0123], 1, [1], 1>;
defm : Zn3WriteResXMM<WriteFMoveX, [], 0, [], 1>;
defm : Zn3WriteResYMM<WriteFMoveY, [], 0, [], 1>;

// Vector register moves. WriteVecMove is the MMX one; the X/Y variants use
// no resources and have zero latency.
defm : Zn3WriteResXMM<WriteVecMove, [Zn3FPFMisc0123], 1, [1], 1>; // MMX
defm : Zn3WriteResXMM<WriteVecMoveX, [], 0, [], 1>;
defm : Zn3WriteResYMM<WriteVecMoveY, [], 0, [], 1>;

// Register moves registered as optimizable, unconditionally (TruePred).
def : IsOptimizableRegisterMove<[
  InstructionEquivalenceClass<[
    // GPR variants.
    MOV32rr, MOV32rr_REV, MOV64rr, MOV64rr_REV,
    MOVSX32rr32,
    XCHG32rr, XCHG32ar, XCHG64rr, XCHG64ar,

    // MMX variants.
    // MMX moves are *NOT* eliminated.

    // SSE variants.
    MOVAPSrr, MOVAPSrr_REV, MOVUPSrr, MOVUPSrr_REV,
    MOVAPDrr, MOVAPDrr_REV, MOVUPDrr, MOVUPDrr_REV,
    MOVDQArr, MOVDQArr_REV, MOVDQUrr, MOVDQUrr_REV,

    // AVX variants.
    VMOVAPSrr, VMOVAPSrr_REV, VMOVUPSrr, VMOVUPSrr_REV,
    VMOVAPDrr, VMOVAPDrr_REV, VMOVUPDrr, VMOVUPDrr_REV,
    VMOVDQArr, VMOVDQArr_REV, VMOVDQUrr, VMOVDQUrr_REV,

    // AVX YMM variants.
    VMOVAPSYrr, VMOVAPSYrr_REV, VMOVUPSYrr, VMOVUPSYrr_REV,
    VMOVAPDYrr, VMOVAPDYrr_REV, VMOVUPDYrr, VMOVUPDYrr_REV,
    VMOVDQAYrr, VMOVDQAYrr_REV, VMOVDQUYrr, VMOVDQUYrr_REV,
  ], TruePred >
]>;

///////////////////////////////////////////////////////////////////////////////
// Dependency breaking instructions.
///////////////////////////////////////////////////////////////////////////////

// Each variant below selects Zn3WriteZeroLatency when its predicate matches
// and otherwise falls back to the regular write for that instruction class.

// GPR XOR/SUB: zero latency when ZeroIdiomPredicate matches, else WriteALU.
def Zn3WriteZeroIdiom : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred,                          [WriteALU]>
]>;
def : InstRW<[Zn3WriteZeroIdiom],
             (instrs XOR32rr, XOR32rr_REV, XOR64rr, XOR64rr_REV,
                     SUB32rr, SUB32rr_REV, SUB64rr, SUB64rr_REV)>;

// GPR CMP: zero latency when operands 0 and 1 are the same register,
// else WriteALU.
def Zn3WriteZeroIdiomEFLAGS : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<CheckSameRegOperand<0, 1>>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred,                                 [WriteALU]>
]>;
def : InstRW<[Zn3WriteZeroIdiomEFLAGS],
             (instrs CMP8rr,  CMP8rr_REV,  CMP16rr, CMP16rr_REV,
                     CMP32rr, CMP32rr_REV, CMP64rr, CMP64rr_REV)>;

// XMM fp logic: zero latency when ZeroIdiomPredicate matches, else WriteFLogic.
def Zn3WriteFZeroIdiom : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred,                          [WriteFLogic]>
]>;
// NOTE: XORPSrr, XORPDrr are not zero-cycle!
def : InstRW<[Zn3WriteFZeroIdiom],
             (instrs VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr)>;

// YMM fp logic: zero latency when ZeroIdiomPredicate matches,
// else WriteFLogicY.
def Zn3WriteFZeroIdiomY : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred,                          [WriteFLogicY]>
]>;
def : InstRW<[Zn3WriteFZeroIdiomY],
             (instrs VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr)>;

// XMM integer logic: zero latency when ZeroIdiomPredicate matches,
// else WriteVecLogicX.
def Zn3WriteVZeroIdiomLogicX : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred,                          [WriteVecLogicX]>
]>;
// NOTE: PXORrr,PANDNrr are not zero-cycle!
def : InstRW<[Zn3WriteVZeroIdiomLogicX],
             (instrs VPXORrr,
                     VPANDNrr)>;

// YMM integer logic: zero latency when ZeroIdiomPredicate matches,
// else WriteVecLogicY.
def Zn3WriteVZeroIdiomLogicY : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred,                          [WriteVecLogicY]>
]>;
def : InstRW<[Zn3WriteVZeroIdiomLogicY],
             (instrs VPXORYrr,
                     VPANDNYrr)>;

// XMM integer ALU: zero latency when ZeroIdiomPredicate matches,
// else WriteVecALUX.
def Zn3WriteVZeroIdiomALUX : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred,                          [WriteVecALUX]>
]>;
// NOTE: PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
//       PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr are not zero-cycle!
def : InstRW<[Zn3WriteVZeroIdiomALUX],
             (instrs VPSUBBrr,   VPSUBWrr,   VPSUBDrr,   VPSUBQrr,
                     VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr)>;

// YMM integer ALU: zero latency when ZeroIdiomPredicate matches,
// else WriteVecALUY.
def Zn3WriteVZeroIdiomALUY : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred,                          [WriteVecALUY]>
]>;
def : InstRW<[Zn3WriteVZeroIdiomALUY],
             (instrs VPSUBBYrr,   VPSUBWYrr,   VPSUBDYrr,   VPSUBQYrr,
                     VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr)>;

// Instructions registered as zero idioms; every class is guarded by
// ZeroIdiomPredicate.
def : IsZeroIdiomFunction<[
  // GPR Zero-idioms.
  DepBreakingClass<[
    XOR32rr, XOR32rr_REV, XOR64rr, XOR64rr_REV,
    SUB32rr, SUB32rr_REV, SUB64rr, SUB64rr_REV
  ], ZeroIdiomPredicate>,

  // SSE XMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr,

    // int variants.
    PXORrr, PANDNrr,
    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
    PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
    PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr
  ], ZeroIdiomPredicate>,

  // AVX XMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr,

    // int variants.
    VPXORrr, VPANDNrr,
    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
    VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr,
    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
  ], ZeroIdiomPredicate>,

  // AVX YMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr,

    // int variants.
    VPXORYrr, VPANDNYrr,
    VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
    VPSUBSBYrr, VPSUBSWYrr, VPSUBUSBYrr, VPSUBUSWYrr,
    VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr
  ], ZeroIdiomPredicate>,
]>;

// Instructions registered as dependency-breaking. The CMP class is guarded by
// CheckSameRegOperand<0, 1>; every other class by ZeroIdiomPredicate.
def : IsDepBreakingFunction<[
  // GPR
  DepBreakingClass<[
    SBB32rr, SBB32rr_REV, SBB64rr, SBB64rr_REV
  ], ZeroIdiomPredicate>,
  DepBreakingClass<[
    CMP8rr,  CMP8rr_REV,  CMP16rr, CMP16rr_REV,
    CMP32rr, CMP32rr_REV, CMP64rr, CMP64rr_REV
  ], CheckSameRegOperand<0, 1> >,

  // MMX
  DepBreakingClass<[
    MMX_PCMPEQBirr, MMX_PCMPEQWirr, MMX_PCMPEQDirr
  ], ZeroIdiomPredicate>,

  // SSE
  DepBreakingClass<[
    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
  ], ZeroIdiomPredicate>,

  // AVX XMM
  DepBreakingClass<[
    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
  ], ZeroIdiomPredicate>,

  // AVX YMM
  DepBreakingClass<[
    VPCMPEQBYrr, VPCMPEQWYrr, VPCMPEQDYrr, VPCMPEQQYrr
  ], ZeroIdiomPredicate>,
]>;

} // SchedModel