//=- X86ScheduleZnver3.td - X86 Znver3 Scheduling ------------*- tablegen -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the machine model for Znver3 to support instruction
// scheduling and other instruction cost heuristics.
// Based on:
//  * AMD Software Optimization Guide for AMD Family 19h Processors.
//    https://www.amd.com/system/files/TechDocs/56665.zip
//  * The microarchitecture of Intel, AMD and VIA CPUs, By Agner Fog
//    http://www.agner.org/optimize/microarchitecture.pdf
//  * AMD Zen 3 Ryzen Deep Dive Review
//    https://www.anandtech.com/show/16214/
//===----------------------------------------------------------------------===//

// Top-level machine model record. Besides the standard SchedMachineModel
// fields it declares two extra integers (VecLoadLatency, StoreLatency) that
// the rest of this file reads as Znver3Model.<field>.
def Znver3Model : SchedMachineModel {
  // AMD SOG 19h, 2.9.6 Dispatch
  // The processor may dispatch up to 6 macro ops per cycle
  // into the execution engine.
  let IssueWidth = 6;

  // AMD SOG 19h, 2.10.3
  // The retire control unit (RCU) tracks the completion status of all
  // outstanding operations (integer, load/store, and floating-point) and is
  // the final arbiter for exception processing and recovery.
  // The unit can receive up to 6 macro ops dispatched per cycle and track up
  // to 256 macro ops in-flight in non-SMT mode or 128 per thread in SMT mode.
  let MicroOpBufferSize = 256;

  // AMD SOG 19h, 2.9.1 Op Cache
  // The op cache is organized as an associative cache with 64 sets and 8 ways.
  // At each set-way intersection is an entry containing up to 8 macro ops.
  // The maximum capacity of the op cache is 4K ops.
  // Agner, 22.5 µop cache
  // The size of the µop cache is big enough for holding most critical loops.
  // FIXME: PR50384: IndVars has quadratic complexity, with large values here
  //        the compilation of certain loops ends up taking way too long.
  // let LoopMicroOpBufferSize = 4096;
  let LoopMicroOpBufferSize = 256;

  // AMD SOG 19h, 2.6.2 L1 Data Cache
  // The L1 data cache has a 4- or 5- cycle integer load-to-use latency.
  // AMD SOG 19h, 2.12 L1 Data Cache
  // The AGU and LS pipelines are optimized for simple address generation modes.
  // <...> and can achieve 4-cycle load-to-use integer load latency.
  let LoadLatency = 4;

  // AMD SOG 19h, 2.12 L1 Data Cache
  // The AGU and LS pipelines are optimized for simple address generation modes.
  // <...> and can achieve <...> 7-cycle load-to-use FP load latency.
  int VecLoadLatency = 7;

  // Latency of a simple store operation.
  int StoreLatency = 1;

  // FIXME: any better choice?
  let HighLatency = 25;

  // AMD SOG 19h, 2.8 Optimizing Branching
  // The branch misprediction penalty is in the range from 11 to 18 cycles,
  // <...>. The common case penalty is 13 cycles.
  let MispredictPenalty = 13;

  // Enable Post RegAlloc Scheduler pass.
  let PostRAScheduler = 1;

  let CompleteModel = 1;
}

let SchedModel = Znver3Model in {


//===----------------------------------------------------------------------===//
// RCU
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.3 Retire Control Unit
// The unit can receive up to 6 macro ops dispatched per cycle and track up to
// 256 macro ops in-flight in non-SMT mode or 128 per thread in SMT mode. <...>
// The retire unit handles in-order commit of up to eight macro ops per cycle.
def Zn3RCU : RetireControlUnit<Znver3Model.MicroOpBufferSize, 8>;

//===----------------------------------------------------------------------===//
// Units
//===----------------------------------------------------------------------===//

// There are a total of three Units, each one with its own schedulers.

//===----------------------------------------------------------------------===//
// Integer Execution Unit
//

// AMD SOG 19h, 2.4 Superscalar Organization
// The processor uses four decoupled independent integer scheduler queues,
// each one servicing one ALU pipeline and one or two other pipelines

//
// Execution pipes
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.2 Execution Units
// The processor contains 4 general purpose integer execution pipes.
// Each pipe has an ALU capable of general purpose integer operations.
def Zn3ALU0 : ProcResource<1>;
def Zn3ALU1 : ProcResource<1>;
def Zn3ALU2 : ProcResource<1>;
def Zn3ALU3 : ProcResource<1>;

// AMD SOG 19h, 2.10.2 Execution Units
// There is also a separate branch execution unit.
def Zn3BRU1 : ProcResource<1>;

// AMD SOG 19h, 2.10.2 Execution Units
// There are three Address Generation Units (AGUs) for all load and store
// address generation. There are also 3 store data movement units
// associated with the same schedulers as the AGUs.
def Zn3AGU0 : ProcResource<1>;
def Zn3AGU1 : ProcResource<1>;
def Zn3AGU2 : ProcResource<1>;

//
// Execution Units
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.2 Execution Units
// ALU0 additionally has divide <...> execution capability.
// The divider is an alias of the ALU0 pipe, not a separate resource.
defvar Zn3Divider = Zn3ALU0;

// AMD SOG 19h, 2.10.2 Execution Units
// ALU0 additionally has <...> branch execution capability.
// Branch unit 0 is an alias of the ALU0 pipe.
defvar Zn3BRU0 = Zn3ALU0;

// Integer Multiplication issued on ALU1.
defvar Zn3Multiplier = Zn3ALU1;

// Execution pipeline grouping
//===----------------------------------------------------------------------===//

// General ALU operations
def Zn3ALU0123 : ProcResGroup<[Zn3ALU0, Zn3ALU1, Zn3ALU2, Zn3ALU3]>;

// General AGU operations
def Zn3AGU012 : ProcResGroup<[Zn3AGU0, Zn3AGU1, Zn3AGU2]>;

// Control flow: jumps, calls
def Zn3BRU01 : ProcResGroup<[Zn3BRU0, Zn3BRU1]>;

// Everything that isn't control flow, but still needs to access CC register,
// namely: conditional moves, SETcc.
def Zn3ALU03 : ProcResGroup<[Zn3ALU0, Zn3ALU3]>;

// Zn3ALU1 handles complex bit twiddling: CRC/PDEP/PEXT

// Simple bit twiddling: bit test, shift/rotate, bit extraction
def Zn3ALU12 : ProcResGroup<[Zn3ALU1, Zn3ALU2]>;


//
// Scheduling
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.3 Retire Control Unit
// The integer physical register file (PRF) consists of 192 registers.
def Zn3IntegerPRF : RegisterFile<192, [GR64, CCR], [1, 1], [1, 0],
                                 6,  // Max moves that can be eliminated per cycle.
                                 0>; // Restrict move elimination to zero regs.

// anandtech, The integer scheduler has a 4*24 entry macro op capacity.
// AMD SOG 19h, 2.10.1 Schedulers
// The schedulers can receive up to six macro ops per cycle, with a limit of
// two per scheduler. Each scheduler can issue one micro op per cycle into
// each of its associated pipelines
// FIXME: these are 4 separate schedulers, not a single big one.
// All integer pipes modelled as one group sharing a single 4*24-entry buffer.
def Zn3Int : ProcResGroup<[
    Zn3ALU0, Zn3AGU0, Zn3BRU0, // scheduler 0
    Zn3ALU1, Zn3AGU1,          // scheduler 1
    Zn3ALU2, Zn3AGU2,          // scheduler 2
    Zn3ALU3, Zn3BRU1           // scheduler 3
  ]> {
  let BufferSize = !mul(4, 24);
}


//===----------------------------------------------------------------------===//
// Floating-Point Unit
//

// AMD SOG 19h, 2.4 Superscalar Organization
// The processor uses <...> two decoupled independent floating point schedulers
// each servicing two FP pipelines and one store or FP-to-integer pipeline.

//
// Execution pipes
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.1 Schedulers
// <...>, and six FPU pipes.
// Agner, 22.10 Floating point execution pipes
// There are six floating point/vector execution pipes,
// Pipes 4 and 5 are represented by one resource with two units.
def Zn3FPP0  : ProcResource<1>;
def Zn3FPP1  : ProcResource<1>;
def Zn3FPP2  : ProcResource<1>;
def Zn3FPP3  : ProcResource<1>;
def Zn3FPP45 : ProcResource<2>;

//
// Execution Units
//===----------------------------------------------------------------------===//
// AMD SOG 19h, 2.11.1 Floating Point Execution Resources

// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
defvar Zn3FPFMul0 = Zn3FPP0;
defvar Zn3FPFMul1 = Zn3FPP1;

// (v)FADD*
defvar Zn3FPFAdd0 = Zn3FPP2;
defvar Zn3FPFAdd1 = Zn3FPP3;

// All convert operations except pack/unpack
defvar Zn3FPFCvt0 = Zn3FPP2;
defvar Zn3FPFCvt1 = Zn3FPP3;

// All Divide and Square Root except Reciprocal Approximation
// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
// FDIV unit can support 2 simultaneous operations in flight
// even though it occupies a single pipe.
// FIXME: BufferSize=2 ?
defvar Zn3FPFDiv = Zn3FPP1;

// Moves and Logical operations on Floating Point Data Types
defvar Zn3FPFMisc0 = Zn3FPP0;
defvar Zn3FPFMisc1 = Zn3FPP1;
defvar Zn3FPFMisc2 = Zn3FPP2;
defvar Zn3FPFMisc3 = Zn3FPP3;

// Integer Adds, Subtracts, and Compares
// Some complex VADD operations are not available in all pipes.
defvar Zn3FPVAdd0 = Zn3FPP0;
defvar Zn3FPVAdd1 = Zn3FPP1;
defvar Zn3FPVAdd2 = Zn3FPP2;
defvar Zn3FPVAdd3 = Zn3FPP3;

// Integer Multiplies, SAD, Blendvb
defvar Zn3FPVMul0 = Zn3FPP0;
defvar Zn3FPVMul1 = Zn3FPP3;

// Data Shuffles, Packs, Unpacks, Permute
// Some complex shuffle operations are only available in pipe1.
defvar Zn3FPVShuf    = Zn3FPP1;
defvar Zn3FPVShufAux = Zn3FPP2;

// Bit Shift Left/Right operations
defvar Zn3FPVShift0 = Zn3FPP1;
defvar Zn3FPVShift1 = Zn3FPP2;

// Moves and Logical operations on Packed Integer Data Types
defvar Zn3FPVMisc0 = Zn3FPP0;
defvar Zn3FPVMisc1 = Zn3FPP1;
defvar Zn3FPVMisc2 = Zn3FPP2;
defvar Zn3FPVMisc3 = Zn3FPP3;

// *AES*
defvar Zn3FPAES0 = Zn3FPP0;
defvar Zn3FPAES1 = Zn3FPP1;

// *CLM*
defvar Zn3FPCLM0 = Zn3FPP0;
defvar Zn3FPCLM1 = Zn3FPP1;

// Execution pipeline grouping
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.11 Floating-Point Unit
// Stores and floating point to general purpose register transfer
// have 2 dedicated pipelines (pipe 5 and 6).
// This group contains the other four pipes, Zn3FPP0 through Zn3FPP3.
def Zn3FPU0123 : ProcResGroup<[Zn3FPP0, Zn3FPP1, Zn3FPP2, Zn3FPP3]>;

// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
def Zn3FPFMul01 : ProcResGroup<[Zn3FPFMul0, Zn3FPFMul1]>;

// (v)FADD*
// Some complex VADD operations are not available in all pipes.
def Zn3FPFAdd01 : ProcResGroup<[Zn3FPFAdd0, Zn3FPFAdd1]>;

// All convert operations except pack/unpack
def Zn3FPFCvt01 : ProcResGroup<[Zn3FPFCvt0, Zn3FPFCvt1]>;

// All Divide and Square Root except Reciprocal Approximation
// def Zn3FPFDiv : ProcResGroup<[Zn3FPFDiv]>;

// Moves and Logical operations on Floating Point Data Types
def Zn3FPFMisc0123 : ProcResGroup<[Zn3FPFMisc0, Zn3FPFMisc1,
                                   Zn3FPFMisc2, Zn3FPFMisc3]>;

def Zn3FPFMisc12 : ProcResGroup<[Zn3FPFMisc1, Zn3FPFMisc2]>;

// Loads, Stores and Move to General Register (EX) Operations
// AMD SOG 19h, 2.11 Floating-Point Unit
// Stores and floating point to general purpose register transfer
// have 2 dedicated pipelines (pipe 5 and 6).
defvar Zn3FPLd01 = Zn3FPP45;

// AMD SOG 19h, 2.11 Floating-Point Unit
// Note that FP stores are supported on two pipelines,
// but throughput is limited to one per cycle.
// Modelled as a single-unit resource whose Super is the two-unit Zn3FPP45.
let Super = Zn3FPP45 in
def Zn3FPSt : ProcResource<1>;

// Integer Adds, Subtracts, and Compares
// Some complex VADD operations are not available in all pipes.
def Zn3FPVAdd0123 : ProcResGroup<[Zn3FPVAdd0, Zn3FPVAdd1,
                                  Zn3FPVAdd2, Zn3FPVAdd3]>;

def Zn3FPVAdd01 : ProcResGroup<[Zn3FPVAdd0, Zn3FPVAdd1]>;
def Zn3FPVAdd12 : ProcResGroup<[Zn3FPVAdd1, Zn3FPVAdd2]>;

// Integer Multiplies, SAD, Blendvb
def Zn3FPVMul01 : ProcResGroup<[Zn3FPVMul0, Zn3FPVMul1]>;

// Data Shuffles, Packs, Unpacks, Permute
// Some complex shuffle operations are only available in pipe1.
def Zn3FPVShuf01 : ProcResGroup<[Zn3FPVShuf, Zn3FPVShufAux]>;

// Bit Shift Left/Right operations
def Zn3FPVShift01 : ProcResGroup<[Zn3FPVShift0, Zn3FPVShift1]>;

// Moves and Logical operations on Packed Integer Data Types
def Zn3FPVMisc0123 : ProcResGroup<[Zn3FPVMisc0, Zn3FPVMisc1,
                                   Zn3FPVMisc2, Zn3FPVMisc3]>;

// *AES*
def Zn3FPAES01 : ProcResGroup<[Zn3FPAES0, Zn3FPAES1]>;

// *CLM*
def Zn3FPCLM01 : ProcResGroup<[Zn3FPCLM0, Zn3FPCLM1]>;


//
// Scheduling
//===----------------------------------------------------------------------===//

// Agner, 21.8 Register renaming and out-of-order schedulers
// The floating point register file has 160 vector registers
// of 128 bits each in Zen 1 and 256 bits each in Zen 2.
// anandtech also confirms this.
def Zn3FpPRF : RegisterFile<160, [VR64, VR128, VR256], [1, 1, 1], [0, 1, 1],
                            6,  // Max moves that can be eliminated per cycle.
                            0>; // Restrict move elimination to zero regs.

// AMD SOG 19h, 2.11 Floating-Point Unit
// The floating-point scheduler has a 2*32 entry macro op capacity.
// AMD SOG 19h, 2.11 Floating-Point Unit
// <...> the scheduler can issue 1 micro op per cycle for each pipe.
// FIXME: those are two separate schedulers, not a single big one.
def Zn3FP : ProcResGroup<[
    Zn3FPP0, Zn3FPP2,          /*Zn3FPP4,*/ // scheduler 0
    Zn3FPP1, Zn3FPP3, Zn3FPP45 /*Zn3FPP5*/  // scheduler 1
  ]> {
  let BufferSize = !mul(2, 32);
}

// AMD SOG 19h, 2.11 Floating-Point Unit
// Macro ops can be dispatched to the 64 entry Non Scheduling Queue (NSQ)
// even if floating-point scheduler is full.
// FIXME: how to model this properly?
//===----------------------------------------------------------------------===//
// Load-Store Unit
//

// AMD SOG 19h, 2.12 Load-Store Unit
// The LS unit contains three largely independent pipe-lines
// enabling the execution of three 256-bit memory operations per cycle.
def Zn3LSU : ProcResource<3>;

// AMD SOG 19h, 2.12 Load-Store Unit
// All three memory operations can be loads.
let Super = Zn3LSU in
def Zn3Load : ProcResource<3> {
  // AMD SOG 19h, 2.12 Load-Store Unit
  // The LS unit can process up to 72 out-of-order loads.
  let BufferSize = 72;
}

def Zn3LoadQueue : LoadQueue<Zn3Load>;

// AMD SOG 19h, 2.12 Load-Store Unit
// A maximum of two of the memory operations can be stores.
let Super = Zn3LSU in
def Zn3Store : ProcResource<2> {
  // AMD SOG 19h, 2.12 Load-Store Unit
  // The LS unit utilizes a 64-entry store queue (STQ).
  let BufferSize = 64;
}

def Zn3StoreQueue : StoreQueue<Zn3Store>;

//===----------------------------------------------------------------------===//
// Basic helper classes.
//===----------------------------------------------------------------------===//

// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when dispatched by the schedulers.
// This multiclass defines the resource usage for variants with and without
// folded loads.
// Emits one anonymous WriteRes binding SchedRW to ExePorts, with latency Lat,
// per-resource cycle counts Res and micro-op count UOps.
multiclass __zn3WriteRes<SchedWrite SchedRW, list<ProcResourceKind> ExePorts,
                         int Lat = 1, list<int> Res = [], int UOps = 1> {
  def : WriteRes<SchedRW, ExePorts> {
    let Latency = Lat;
    let ResourceCycles = Res;
    let NumMicroOps = UOps;
  }
}

// Emits two records: one for SchedRW itself and one for SchedRW.Folded.
// For the folded variant:
//  * resources are [AGU, Zn3Load] followed by ExePorts;
//  * latency is Lat + LoadLat, micro-ops are UOps + LoadUOps;
//  * resource cycles stay empty when Res is empty and LoadRes is 1; otherwise
//    they are [1, LoadRes] followed by Res, or by one `1` per ExePorts entry
//    when Res is empty.
multiclass __zn3WriteResPair<X86FoldableSchedWrite SchedRW,
                             list<ProcResourceKind> ExePorts, int Lat,
                             list<int> Res, int UOps, int LoadLat, int LoadUOps,
                             ProcResourceKind AGU, int LoadRes> {
  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;

  defm : __zn3WriteRes<SchedRW.Folded,
                       !listconcat([AGU, Zn3Load], ExePorts),
                       !add(Lat, LoadLat),
                       !if(!and(!empty(Res), !eq(LoadRes, 1)),
                         [],
                         !listconcat([1, LoadRes],
                           !if(!empty(Res),
                             !listsplat(1, !size(ExePorts)),
                             Res))),
                       !add(UOps, LoadUOps)>;
}

// For classes without folded loads.
// The Int/XMM/YMM variants below all forward unchanged to __zn3WriteRes.
multiclass Zn3WriteResInt<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

multiclass Zn3WriteResXMM<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

multiclass Zn3WriteResYMM<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

// For classes with folded loads.
// Integer pair: the folded form adds Znver3Model.LoadLatency and goes through
// the Zn3AGU012 group.
multiclass Zn3WriteResIntPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           Znver3Model.LoadLatency,
                           LoadUOps, Zn3AGU012, LoadRes>;
}

// XMM pair: the folded form adds Znver3Model.VecLoadLatency and goes through
// Zn3FPLd01.
multiclass Zn3WriteResXMMPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           Znver3Model.VecLoadLatency,
                           LoadUOps, Zn3FPLd01, LoadRes>;
}

// YMM pair: same parameters as the XMM pair.
multiclass Zn3WriteResYMMPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           Znver3Model.VecLoadLatency,
                           LoadUOps, Zn3FPLd01, LoadRes>;
}


//===----------------------------------------------------------------------===//
// Here be dragons.
//===----------------------------------------------------------------------===//

def : ReadAdvance<ReadAfterLd, Znver3Model.LoadLatency>;

def : ReadAdvance<ReadAfterVecLd, Znver3Model.VecLoadLatency>;
def : ReadAdvance<ReadAfterVecXLd, Znver3Model.VecLoadLatency>;
def : ReadAdvance<ReadAfterVecYLd, Znver3Model.VecLoadLatency>;

// AMD SOG 19h, 2.11 Floating-Point Unit
// There is 1 cycle of added latency for a result to cross
// from F to I or I to F domain.
def : ReadAdvance<ReadInt2Fpu, -1>;

// Instructions with both a load and a store folded are modeled as a folded
// load + WriteRMW.
defm : Zn3WriteResInt<WriteRMW, [Zn3AGU012, Zn3Store],
                      Znver3Model.StoreLatency, [1, 1], 0>;

// Loads, stores, and moves, not folded with other operations.
defm : Zn3WriteResInt<WriteLoad, [Zn3AGU012, Zn3Load],
                      !add(Znver3Model.LoadLatency, 1), [1, 1], 1>;

// Same latency as WriteLoad, but the AGU group is held for 3 cycles.
def Zn3WriteMOVSlow : SchedWriteRes<[Zn3AGU012, Zn3Load]> {
  let Latency = !add(Znver3Model.LoadLatency, 1);
  let ResourceCycles = [3, 1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteMOVSlow], (instrs MOV8rm, MOV8rm_NOREX, MOV16rm,
                                        MOVSX16rm16, MOVSX16rm32, MOVZX16rm16,
                                        MOVSX16rm8, MOVZX16rm8)>;

defm : Zn3WriteResInt<WriteStore, [Zn3AGU012, Zn3Store],
                      Znver3Model.StoreLatency, [1, 2], 1>;
defm : Zn3WriteResInt<WriteStoreNT, [Zn3AGU012, Zn3Store],
                      Znver3Model.StoreLatency, [1, 2], 1>;
defm : Zn3WriteResInt<WriteMove, [Zn3ALU0123], 1, [4], 1>;

// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;

def Zn3WriteMOVBE16rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
  let Latency = Znver3Model.LoadLatency;
  let ResourceCycles = [1, 1, 4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteMOVBE16rm], (instrs MOVBE16rm)>;

def Zn3WriteMOVBEmr : SchedWriteRes<[Zn3ALU0123, Zn3AGU012, Zn3Store]> {
  let Latency = Znver3Model.StoreLatency;
  let ResourceCycles = [4, 1, 1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteMOVBEmr], (instrs MOVBE16mr, MOVBE32mr, MOVBE64mr)>;

// Arithmetic.
// Simple integer ALU op.
defm : Zn3WriteResIntPair<WriteALU, [Zn3ALU0123], 1, [1], 1>;

// Accumulator-immediate ALU forms: same latency as WriteALU, but the ALU group
// is held for 4 cycles.
def Zn3WriteALUSlow : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 1;
  let ResourceCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteALUSlow], (instrs ADD8i8, ADD16i16, ADD32i32, ADD64i32,
                                        AND8i8, AND16i16, AND32i32, AND64i32,
                                         OR8i8,  OR16i16,  OR32i32,  OR64i32,
                                        SUB8i8, SUB16i16, SUB32i32, SUB64i32,
                                        XOR8i8, XOR16i16, XOR32i32, XOR64i32)>;

def Zn3WriteMoveExtend : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 1;
  let ResourceCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteMoveExtend], (instrs MOVSX16rr16, MOVSX16rr32,
                                           MOVZX16rr16, MOVSX16rr8,
                                           MOVZX16rr8)>;

def Zn3WriteMaterialize32bitImm : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 1;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteMaterialize32bitImm], (instrs MOV32ri, MOV32ri_alt,
                                                    MOV64ri32)>;

// PDEP/PEXT are bound to the single Zn3ALU1 pipe.
def Zn3WritePDEP_PEXT : SchedWriteRes<[Zn3ALU1]> {
  let Latency = 3;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WritePDEP_PEXT], (instrs PDEP32rr, PDEP64rr,
                                          PEXT32rr, PEXT64rr)>;

// Integer ALU + flags op.
defm : Zn3WriteResIntPair<WriteADC, [Zn3ALU0123], 1, [4], 1>;

def Zn3WriteADC8mr_SBB8mr : SchedWriteRes<[Zn3AGU012, Zn3Load,
                                           Zn3ALU0123, Zn3Store]> {
  let Latency = 1;
  let ResourceCycles = [1, 1, 7, 1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteADC8mr_SBB8mr], (instrs ADC8mr, SBB8mr)>;

// This is for simple LEAs with one or two input operands.
// LEA instructions can't fold loads.
defm : Zn3WriteResInt<WriteLEA, [Zn3AGU012], 1, [1], 1>;

// This write is used for slow LEA instructions.
def Zn3Write3OpsLEA : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 2;
  let ResourceCycles = [1];
  let NumMicroOps = 2;
}

// On Piledriver, a slow LEA is either a 3Ops LEA (base, index, offset),
// or an LEA with a `Scale` value different than 1.
// NOTE(review): "Piledriver" looks inherited from another scheduling model;
// confirm the wording for Znver3.
def Zn3SlowLEAPredicate : MCSchedPredicate<
  CheckAny<[
    // A 3-operand LEA (base, index, offset).
    IsThreeOperandsLEAFn,
    // An LEA with a "Scale" different than 1.
    CheckAll<[
      CheckIsImmOperand<2>,
      CheckNot<CheckImmOperand<2, 1>>
    ]>
  ]>
>;

// Picks Zn3Write3OpsLEA when the slow-LEA predicate matches, else WriteLEA.
def Zn3WriteLEA : SchedWriteVariant<[
  SchedVar<Zn3SlowLEAPredicate, [Zn3Write3OpsLEA]>,
  SchedVar<NoSchedPred,         [WriteLEA]>
]>;

def : InstRW<[Zn3WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;

def Zn3SlowLEA16r : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 2; // FIXME: not from llvm-exegesis
  let ResourceCycles = [4];
  let NumMicroOps = 2;
}

def : InstRW<[Zn3SlowLEA16r], (instrs LEA16r)>;

// Integer multiplication
// Integer 8-bit multiplication.
defm : Zn3WriteResIntPair<WriteIMul8, [Zn3Multiplier], 3, [3], 1>;
// Integer 16-bit multiplication.
defm : Zn3WriteResIntPair<WriteIMul16, [Zn3Multiplier], 3, [3], 3,
                          /*LoadUOps=*/1>;
// Integer 16-bit multiplication by immediate.
defm : Zn3WriteResIntPair<WriteIMul16Imm, [Zn3Multiplier], 4, [4], 2>;
// Integer 16-bit multiplication by register.
defm : Zn3WriteResIntPair<WriteIMul16Reg, [Zn3Multiplier], 3, [1], 1>;
// Integer 32-bit multiplication.
defm : Zn3WriteResIntPair<WriteIMul32, [Zn3Multiplier], 3, [3], 2>;

def Zn3MULX32rr : SchedWriteRes<[Zn3Multiplier]> {
  let Latency = 4;
  let ResourceCycles = [1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3MULX32rr, WriteIMulH], (instrs MULX32rr)>;

// Memory form: latency and micro-op count are derived from the rr record.
def Zn3MULX32rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3Multiplier]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3MULX32rr.Latency);
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = Zn3MULX32rr.NumMicroOps;
}
def : InstRW<[Zn3MULX32rm, WriteIMulH], (instrs MULX32rm)>;

// Integer 32-bit multiplication by immediate.
defm : Zn3WriteResIntPair<WriteIMul32Imm, [Zn3Multiplier], 3, [1], 1>;
// Integer 32-bit multiplication by register.
defm : Zn3WriteResIntPair<WriteIMul32Reg, [Zn3Multiplier], 3, [1], 1>;
// Integer 64-bit multiplication.
defm : Zn3WriteResIntPair<WriteIMul64, [Zn3Multiplier], 3, [3], 2>;

def Zn3MULX64rr : SchedWriteRes<[Zn3Multiplier]> {
  let Latency = 4;
  let ResourceCycles = [1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3MULX64rr, WriteIMulH], (instrs MULX64rr)>;

// Memory form: latency and micro-op count are derived from the rr record.
def Zn3MULX64rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3Multiplier]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3MULX64rr.Latency);
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = Zn3MULX64rr.NumMicroOps;
}
def : InstRW<[Zn3MULX64rm, WriteIMulH], (instrs MULX64rm)>;

// Integer 64-bit multiplication by immediate.
defm : Zn3WriteResIntPair<WriteIMul64Imm, [Zn3Multiplier], 3, [1], 1>;
// Integer 64-bit multiplication by register.
defm : Zn3WriteResIntPair<WriteIMul64Reg, [Zn3Multiplier], 3, [1], 1>;
// Integer multiplication, high part.
// Uses no resources and no micro-ops; only the latency of 4 is set.
defm : Zn3WriteResInt<WriteIMulH, [], 4, [], 0>;

// Byte Order (Endianness) 32-bit Swap.
defm : Zn3WriteResInt<WriteBSWAP32, [Zn3ALU0123], 1, [1], 1>;
// Byte Order (Endianness) 64-bit Swap.
defm : Zn3WriteResInt<WriteBSWAP64, [Zn3ALU0123], 1, [1], 1>;

// Compare and set, compare and swap.
defm : Zn3WriteResIntPair<WriteCMPXCHG, [Zn3ALU0123], 3, [12], 5>;

def Zn3WriteCMPXCHG8rr : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 3;
  let ResourceCycles = [12];
  let NumMicroOps = 3;
}
def : InstRW<[Zn3WriteCMPXCHG8rr], (instrs CMPXCHG8rr)>;

// Compare and set, compare and swap.
defm : Zn3WriteResInt<WriteCMPXCHGRMW, [Zn3ALU0123], 3, [12], 6>;

// Memory form: latency and micro-op count build on Zn3WriteCMPXCHG8rr.
def Zn3WriteCMPXCHG8rm_LCMPXCHG8 : SchedWriteRes<[Zn3AGU012, Zn3Load,
                                                  Zn3ALU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteCMPXCHG8rr.Latency);
  let ResourceCycles = [1, 1, 12];
  let NumMicroOps = !add(Zn3WriteCMPXCHG8rr.NumMicroOps, 2);
}
def : InstRW<[Zn3WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>;

def Zn3WriteCMPXCHG8B : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 3; // FIXME: not from llvm-exegesis
  let ResourceCycles = [24];
  let NumMicroOps = 19;
}
def : InstRW<[Zn3WriteCMPXCHG8B], (instrs CMPXCHG8B)>;

def Zn3WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 4; // FIXME: not from llvm-exegesis
  let ResourceCycles = [59];
  let NumMicroOps = 28;
}
def : InstRW<[Zn3WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B,
                                                       LCMPXCHG16B)>;

def Zn3WriteWriteXCHGUnrenameable : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 1;
  let ResourceCycles = [2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr,
                                                      XCHG16ar)>;

def Zn3WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn3AGU012, Zn3Load,
                                              Zn3ALU0123]> {
  // FIXME: not from llvm-exegesis
  let Latency = !add(Znver3Model.LoadLatency, 3);
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = 5;
}
def : InstRW<[Zn3WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>;

def Zn3WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn3AGU012, Zn3Load,
                                               Zn3ALU0123]> {
  // FIXME: not from llvm-exegesis
  let Latency = !add(Znver3Model.LoadLatency, 2);
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>;

// Integer division.
// FIXME: uops for 8-bit division measures as 2. for others it's a guess.
// FIXME: latency for 8-bit division measures as 10. for others it's a guess.
// In every division entry the divider is held for as many cycles as the
// latency. Signed and unsigned forms use the same numbers.
defm : Zn3WriteResIntPair<WriteDiv8,   [Zn3Divider], 10, [10], 2>;
defm : Zn3WriteResIntPair<WriteDiv16,  [Zn3Divider], 11, [11], 2>;
defm : Zn3WriteResIntPair<WriteDiv32,  [Zn3Divider], 13, [13], 2>;
defm : Zn3WriteResIntPair<WriteDiv64,  [Zn3Divider], 17, [17], 2>;
defm : Zn3WriteResIntPair<WriteIDiv8,  [Zn3Divider], 10, [10], 2>;
defm : Zn3WriteResIntPair<WriteIDiv16, [Zn3Divider], 11, [11], 2>;
defm : Zn3WriteResIntPair<WriteIDiv32, [Zn3Divider], 13, [13], 2>;
defm : Zn3WriteResIntPair<WriteIDiv64, [Zn3Divider], 17, [17], 2>;

// Bit scan forward.
defm : Zn3WriteResIntPair<WriteBSF, [Zn3ALU1], 3, [3], 6, /*LoadUOps=*/2>;
// Bit scan reverse.
defm : Zn3WriteResIntPair<WriteBSR, [Zn3ALU1], 4, [4], 6, /*LoadUOps=*/2>;

// Bit population count.
defm : Zn3WriteResIntPair<WritePOPCNT, [Zn3ALU0123], 1, [1], 1>;

def Zn3WritePOPCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 1;
  let ResourceCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WritePOPCNT16rr], (instrs POPCNT16rr)>;

// Leading zero count.
defm : Zn3WriteResIntPair<WriteLZCNT, [Zn3ALU0123], 1, [1], 1>;

def Zn3WriteLZCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 1;
  let ResourceCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteLZCNT16rr], (instrs LZCNT16rr)>;

// Trailing zero count.
defm : Zn3WriteResIntPair<WriteTZCNT, [Zn3ALU12], 2, [1], 2>;

def Zn3WriteTZCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 2;
  let ResourceCycles = [4];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteTZCNT16rr], (instrs TZCNT16rr)>;

// Conditional move.
defm : Zn3WriteResIntPair<WriteCMOV, [Zn3ALU03], 1, [1], 1>;
// X87 conditional move.
// FIXME: not from llvm-exegesis
defm : Zn3WriteResInt<WriteFCMOV, [Zn3ALU0123], 7, [28], 7>;
// Set register based on condition code.
defm : Zn3WriteResInt<WriteSETCC, [Zn3ALU03], 1, [2], 1>;
// FIXME: latency not from llvm-exegesis
defm : Zn3WriteResInt<WriteSETCCStore, [Zn3ALU03, Zn3AGU012, Zn3Store],
                      2, [2, 1, 1], 2>;
// Load/Store flags in AH.
defm : Zn3WriteResInt<WriteLAHFSAHF, [Zn3ALU3], 1, [1], 1>;

// Bit Test
defm : Zn3WriteResInt<WriteBitTest, [Zn3ALU12], 1, [1], 1>;
defm : Zn3WriteResInt<WriteBitTestImmLd, [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 1), [1, 1, 1], 2>;
defm : Zn3WriteResInt<WriteBitTestRegLd, [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 1), [1, 1, 1], 7>;

// Bit Test + Set
defm : Zn3WriteResInt<WriteBitTestSet, [Zn3ALU12], 2, [2], 2>;
defm : Zn3WriteResInt<WriteBitTestSetImmLd, [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 2), [1, 1, 1], 4>;
defm : Zn3WriteResInt<WriteBitTestSetRegLd, [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 2), [1, 1, 1], 9>;

// Integer shifts and rotates.
defm : Zn3WriteResIntPair<WriteShift,   [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn3WriteResIntPair<WriteShiftCL, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn3WriteResIntPair<WriteRotate,  [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;

// Rotate-through-carry by one. For each RCL/RCR group below, the memory form
// derives its latency and micro-op count from the matching register form.
def Zn3WriteRotateR1 : SchedWriteRes<[Zn3ALU12]> {
  let Latency = 1;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteRotateR1], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
                                         RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;

def Zn3WriteRotateM1 : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateR1.Latency);
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn3WriteRotateR1.NumMicroOps, 1);
}
def : InstRW<[Zn3WriteRotateM1], (instrs RCL8m1, RCL16m1, RCL32m1, RCL64m1,
                                         RCR8m1, RCR16m1, RCR32m1, RCR64m1)>;

def Zn3WriteRotateRightRI : SchedWriteRes<[Zn3ALU12]> {
  let Latency = 3;
  let ResourceCycles = [6];
  let NumMicroOps = 7;
}
def : InstRW<[Zn3WriteRotateRightRI], (instrs RCR8ri, RCR16ri,
                                              RCR32ri, RCR64ri)>;

def Zn3WriteRotateRightMI : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateRightRI.Latency);
  let ResourceCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn3WriteRotateRightRI.NumMicroOps, 3);
}
def : InstRW<[Zn3WriteRotateRightMI], (instrs RCR8mi, RCR16mi,
                                              RCR32mi, RCR64mi)>;

def Zn3WriteRotateLeftRI : SchedWriteRes<[Zn3ALU12]> {
  let Latency = 4;
  let ResourceCycles = [8];
  let NumMicroOps = 9;
}
def : InstRW<[Zn3WriteRotateLeftRI], (instrs RCL8ri, RCL16ri,
                                             RCL32ri, RCL64ri)>;

def Zn3WriteRotateLeftMI : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateLeftRI.Latency);
  let ResourceCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn3WriteRotateLeftRI.NumMicroOps, 2);
}
def : InstRW<[Zn3WriteRotateLeftMI], (instrs RCL8mi, RCL16mi,
                                             RCL32mi, RCL64mi)>;

defm : Zn3WriteResIntPair<WriteRotateCL, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;

def Zn3WriteRotateRightRCL : SchedWriteRes<[Zn3ALU12]> {
  let Latency = 3;
  let ResourceCycles = [6];
  let NumMicroOps = 7;
}
def : InstRW<[Zn3WriteRotateRightRCL], (instrs RCR8rCL, RCR16rCL,
                                               RCR32rCL, RCR64rCL)>;

def Zn3WriteRotateRightMCL : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateRightRCL.Latency);
  let ResourceCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn3WriteRotateRightRCL.NumMicroOps, 2);
}
def : InstRW<[Zn3WriteRotateRightMCL], (instrs RCR8mCL, RCR16mCL,
                                               RCR32mCL, RCR64mCL)>;

def Zn3WriteRotateLeftRCL : SchedWriteRes<[Zn3ALU12]> {
  let Latency = 4;
  let ResourceCycles = [8];
  let NumMicroOps = 9;
}
def : InstRW<[Zn3WriteRotateLeftRCL], (instrs RCL8rCL, RCL16rCL,
                                              RCL32rCL, RCL64rCL)>;

def Zn3WriteRotateLeftMCL : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateLeftRCL.Latency);
  let ResourceCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn3WriteRotateLeftRCL.NumMicroOps, 2);
}
def : InstRW<[Zn3WriteRotateLeftMCL], (instrs RCL8mCL, RCL16mCL,
                                              RCL32mCL, RCL64mCL)>;

// Double shift instructions.
defm : Zn3WriteResInt<WriteSHDrri,  [Zn3ALU12], 2, [3], 4>;
defm : Zn3WriteResInt<WriteSHDrrcl, [Zn3ALU12], 2, [3], 5>;
defm : Zn3WriteResInt<WriteSHDmri,  [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 2), [1, 1, 4], 6>;
defm : Zn3WriteResInt<WriteSHDmrcl, [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 2), [1, 1, 4], 6>;

// BMI1 BEXTR/BLS, BMI2 BZHI
defm : Zn3WriteResIntPair<WriteBEXTR, [Zn3ALU12],   1, [1], 1, /*LoadUOps=*/1>;
defm : Zn3WriteResIntPair<WriteBLS,   [Zn3ALU0123], 2, [2], 2, /*LoadUOps=*/1>;
defm : Zn3WriteResIntPair<WriteBZHI,  [Zn3ALU12],   1, [1], 1, /*LoadUOps=*/1>;

// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely.
defm : Zn3WriteResInt<WriteZero, [Zn3ALU0123], 0, [0], 1>;

// Branches don't produce values, so they have no latency, but they still
// consume resources. Indirect branches can fold loads.
// FIXME: not from llvm-exegesis
defm : Zn3WriteResIntPair<WriteJump, [Zn3BRU01], 1, [1], 1>;

// Floating point. This covers both scalar and vector operations.
// x87 constant loads (FLD0 / FLD1 / FLDC): expressed as LoadLatency plus a
// fixed number of extra cycles.
defm : Zn3WriteResInt<WriteFLD0, [Zn3FPLd01, Zn3Load, Zn3FPP1],
                      !add(Znver3Model.LoadLatency, 4), [1, 1, 1], 1>;
defm : Zn3WriteResInt<WriteFLD1, [Zn3FPLd01, Zn3Load, Zn3FPP1],
                      !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
defm : Zn3WriteResInt<WriteFLDC, [Zn3FPLd01, Zn3Load, Zn3FPP1],
                      !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;

// FP loads (plain and masked): VecLoadLatency + 1, single uop.
defm : Zn3WriteResXMM<WriteFLoad, [Zn3FPLd01, Zn3Load],
                      !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteFLoadX, [Zn3FPLd01, Zn3Load],
                      !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteFLoadY, [Zn3FPLd01, Zn3Load],
                      !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteFMaskedLoad, [Zn3FPLd01, Zn3Load],
                      !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteFMaskedLoadY, [Zn3FPLd01, Zn3Load],
                      !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;

// FP stores use the model's StoreLatency.
defm : Zn3WriteResXMM<WriteFStore, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [1, 1], 1>;

// Two-uop store override for the high-half stores listed below.
// NOTE(review): the record is named "...MMX" but it is bound to
// (V)MOVHPD/(V)MOVHPS stores - confirm the name is intentional.
def Zn3WriteWriteFStoreMMX : SchedWriteRes<[Zn3FPSt, Zn3Store]> {
  let Latency = 2; // FIXME: not from llvm-exegesis
  let NumMicroOps = 2;
  let ResourceCycles = [1, 1];
}
def : InstRW<[Zn3WriteWriteFStoreMMX],
             (instrs MOVHPDmr, MOVHPSmr,
                     VMOVHPDmr, VMOVHPSmr)>;

defm : Zn3WriteResXMM<WriteFStoreX, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResYMM<WriteFStoreY, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResXMM<WriteFStoreNT, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResXMM<WriteFStoreNTX, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResYMM<WriteFStoreNTY, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [1, 1], 1>;

// Masked FP stores: many uops and several cycles on Zn3FPSt.
defm : Zn3WriteResXMM<WriteFMaskedStore32, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [6, 1], 18>;
defm : Zn3WriteResXMM<WriteFMaskedStore64, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [4, 1], 10>;
defm : Zn3WriteResYMM<WriteFMaskedStore32Y, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [12, 1], 42>;
defm : Zn3WriteResYMM<WriteFMaskedStore64Y, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [6, 1], 18>;

defm : Zn3WriteResXMMPair<WriteFAdd, [Zn3FPFAdd01], 3, [1], 1>; // Floating point add/sub.

// x87 add/sub/mul with an integer memory operand.
def Zn3WriteX87Arith : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
  let NumMicroOps = 2;
  let ResourceCycles = [1, 1, 24];
}
def : InstRW<[Zn3WriteX87Arith],
             (instrs ADD_FI16m, ADD_FI32m,
                     SUB_FI16m, SUB_FI32m,
                     SUBR_FI16m, SUBR_FI32m,
                     MUL_FI16m, MUL_FI32m)>;

// x87 divide with an integer memory operand: same shape as above, but the
// Zn3FPU0123 group is held for 62 cycles instead of 24.
def Zn3WriteX87Div : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
  let NumMicroOps = 2;
  let ResourceCycles = [1, 1, 62];
}
def : InstRW<[Zn3WriteX87Div],
             (instrs DIV_FI16m, DIV_FI32m,
                     DIVR_FI16m, DIVR_FI32m)>;

defm : Zn3WriteResXMMPair<WriteFAddX,   [Zn3FPFAdd01], 3, [1], 1>; // Floating point add/sub (XMM).
defm : Zn3WriteResYMMPair<WriteFAddY,   [Zn3FPFAdd01], 3, [1], 1>; // Floating point add/sub (YMM).
defm : X86WriteResPairUnsupported<WriteFAddZ>; // Floating point add/sub (ZMM).
defm : Zn3WriteResXMMPair<WriteFAdd64,  [Zn3FPFAdd01], 3, [1], 1>; // Floating point double add/sub.
defm : Zn3WriteResXMMPair<WriteFAdd64X, [Zn3FPFAdd01], 3, [1], 1>; // Floating point double add/sub (XMM).
defm : Zn3WriteResYMMPair<WriteFAdd64Y, [Zn3FPFAdd01], 3, [1], 1>; // Floating point double add/sub (YMM).
defm : X86WriteResPairUnsupported<WriteFAdd64Z>; // Floating point double add/sub (ZMM).
defm : Zn3WriteResXMMPair<WriteFCmp,    [Zn3FPFMul01], 1, [1], 1>; // Floating point compare.
defm : Zn3WriteResXMMPair<WriteFCmpX,   [Zn3FPFMul01], 1, [1], 1>; // Floating point compare (XMM).
// FP compare, compare-to-flags, multiply and the first divide rows.
// ZMM classes are marked unsupported throughout this table.
defm : Zn3WriteResYMMPair<WriteFCmpY, [Zn3FPFMul01], 1, [1], 1>; // Floating point compare (YMM).
defm : X86WriteResPairUnsupported<WriteFCmpZ>; // Floating point compare (ZMM).
defm : Zn3WriteResXMMPair<WriteFCmp64, [Zn3FPFMul01], 1, [1], 1>; // Floating point double compare.
defm : Zn3WriteResXMMPair<WriteFCmp64X, [Zn3FPFMul01], 1, [1], 1>; // Floating point double compare (XMM).
defm : Zn3WriteResYMMPair<WriteFCmp64Y, [Zn3FPFMul01], 1, [1], 1>; // Floating point double compare (YMM).
defm : X86WriteResPairUnsupported<WriteFCmp64Z>; // Floating point double compare (ZMM).
defm : Zn3WriteResXMMPair<WriteFCom, [Zn3FPFMul01], 3, [2], 1>; // FIXME: latency not from llvm-exegesis // Floating point compare to flags (X87).
defm : Zn3WriteResXMMPair<WriteFComX, [Zn3FPFMul01], 4, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point compare to flags (SSE).
defm : Zn3WriteResXMMPair<WriteFMul, [Zn3FPFMul01], 3, [1], 1>; // Floating point multiplication.
defm : Zn3WriteResXMMPair<WriteFMulX, [Zn3FPFMul01], 3, [1], 1>; // Floating point multiplication (XMM).
defm : Zn3WriteResYMMPair<WriteFMulY, [Zn3FPFMul01], 3, [1], 1>; // Floating point multiplication (YMM).
defm : X86WriteResPairUnsupported<WriteFMulZ>; // Floating point multiplication (ZMM).
defm : Zn3WriteResXMMPair<WriteFMul64, [Zn3FPFMul01], 3, [1], 1>; // Floating point double multiplication.
defm : Zn3WriteResXMMPair<WriteFMul64X, [Zn3FPFMul01], 3, [1], 1>; // Floating point double multiplication (XMM).
defm : Zn3WriteResYMMPair<WriteFMul64Y, [Zn3FPFMul01], 3, [1], 1>; // Floating point double multiplication (YMM).
defm : X86WriteResPairUnsupported<WriteFMul64Z>; // Floating point double multiplication (ZMM).
defm : Zn3WriteResXMMPair<WriteFDiv, [Zn3FPFDiv], 11, [3], 1>; // Floating point division.
defm : Zn3WriteResXMMPair<WriteFDivX, [Zn3FPFDiv], 11, [3], 1>; // Floating point division (XMM).
// FP division and square root rows (all on Zn3FPFDiv), followed by the
// reciprocal-estimate rows (on Zn3FPFMul01).
defm : Zn3WriteResYMMPair<WriteFDivY, [Zn3FPFDiv], 11, [3], 1>; // Floating point division (YMM).
defm : X86WriteResPairUnsupported<WriteFDivZ>; // Floating point division (ZMM).
defm : Zn3WriteResXMMPair<WriteFDiv64, [Zn3FPFDiv], 13, [5], 1>; // Floating point double division.
defm : Zn3WriteResXMMPair<WriteFDiv64X, [Zn3FPFDiv], 13, [5], 1>; // Floating point double division (XMM).
defm : Zn3WriteResYMMPair<WriteFDiv64Y, [Zn3FPFDiv], 13, [5], 1>; // Floating point double division (YMM).
defm : X86WriteResPairUnsupported<WriteFDiv64Z>; // Floating point double division (ZMM).
defm : Zn3WriteResXMMPair<WriteFSqrt, [Zn3FPFDiv], 15, [5], 1>; // Floating point square root.
defm : Zn3WriteResXMMPair<WriteFSqrtX, [Zn3FPFDiv], 15, [5], 1>; // Floating point square root (XMM).
defm : Zn3WriteResYMMPair<WriteFSqrtY, [Zn3FPFDiv], 15, [5], 1>; // Floating point square root (YMM).
defm : X86WriteResPairUnsupported<WriteFSqrtZ>; // Floating point square root (ZMM).
defm : Zn3WriteResXMMPair<WriteFSqrt64, [Zn3FPFDiv], 21, [9], 1>; // Floating point double square root.
defm : Zn3WriteResXMMPair<WriteFSqrt64X, [Zn3FPFDiv], 21, [9], 1>; // Floating point double square root (XMM).
defm : Zn3WriteResYMMPair<WriteFSqrt64Y, [Zn3FPFDiv], 21, [9], 1>; // Floating point double square root (YMM).
defm : X86WriteResPairUnsupported<WriteFSqrt64Z>; // Floating point double square root (ZMM).
defm : Zn3WriteResXMMPair<WriteFSqrt80, [Zn3FPFDiv], 22, [23], 1>; // FIXME: latency not from llvm-exegesis // Floating point long double square root.
defm : Zn3WriteResXMMPair<WriteFRcp, [Zn3FPFMul01], 3, [1], 1>; // Floating point reciprocal estimate.
defm : Zn3WriteResXMMPair<WriteFRcpX, [Zn3FPFMul01], 3, [1], 1>; // Floating point reciprocal estimate (XMM).
defm : Zn3WriteResYMMPair<WriteFRcpY, [Zn3FPFMul01], 3, [1], 1>; // Floating point reciprocal estimate (YMM).
// Reciprocal square root estimate, FMA, dot product, sign, rounding and the
// first FP logic row.
// NOTE(review): /*LoadUOps=*/ is presumably the uop adjustment applied to the
// load-folding variant of a pair - confirm against the multiclass definition.
defm : X86WriteResPairUnsupported<WriteFRcpZ>; // Floating point reciprocal estimate (ZMM).
defm : Zn3WriteResXMMPair<WriteFRsqrt, [Zn3FPFDiv], 3, [1], 1>; // Floating point reciprocal square root estimate.
defm : Zn3WriteResXMMPair<WriteFRsqrtX, [Zn3FPFDiv], 3, [1], 1>; // Floating point reciprocal square root estimate (XMM).
defm : Zn3WriteResYMMPair<WriteFRsqrtY, [Zn3FPFDiv], 3, [1], 1>; // Floating point reciprocal square root estimate (YMM).
defm : X86WriteResPairUnsupported<WriteFRsqrtZ>; // Floating point reciprocal square root estimate (ZMM).
defm : Zn3WriteResXMMPair<WriteFMA, [Zn3FPFMul01], 4, [2], 1>; // Fused Multiply Add.
defm : Zn3WriteResXMMPair<WriteFMAX, [Zn3FPFMul01], 4, [2], 1>; // Fused Multiply Add (XMM).
defm : Zn3WriteResYMMPair<WriteFMAY, [Zn3FPFMul01], 4, [2], 1>; // Fused Multiply Add (YMM).
defm : X86WriteResPairUnsupported<WriteFMAZ>; // Fused Multiply Add (ZMM).
defm : Zn3WriteResXMMPair<WriteDPPD, [Zn3FPFMul01], 9, [6], 3, /*LoadUOps=*/2>; // Floating point double dot product.
defm : Zn3WriteResXMMPair<WriteDPPS, [Zn3FPFMul01], 15, [8], 8, /*LoadUOps=*/2>; // Floating point single dot product.
defm : Zn3WriteResYMMPair<WriteDPPSY, [Zn3FPFMul01], 15, [8], 7, /*LoadUOps=*/1>; // Floating point single dot product (YMM).
defm : X86WriteResPairUnsupported<WriteDPPSZ>; // Floating point single dot product (ZMM).
defm : Zn3WriteResXMMPair<WriteFSign, [Zn3FPFMul01], 1, [2], 1>; // FIXME: latency not from llvm-exegesis // Floating point fabs/fchs.
defm : Zn3WriteResXMMPair<WriteFRnd, [Zn3FPFCvt01], 3, [1], 1>; // Floating point rounding.
defm : Zn3WriteResYMMPair<WriteFRndY, [Zn3FPFCvt01], 3, [1], 1>; // Floating point rounding (YMM).
defm : X86WriteResPairUnsupported<WriteFRndZ>; // Floating point rounding (ZMM).
defm : Zn3WriteResXMMPair<WriteFLogic, [Zn3FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals.
// FP logic (YMM/ZMM), FP test, FP shuffles and FP blends.
defm : Zn3WriteResYMMPair<WriteFLogicY, [Zn3FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals (YMM).
defm : X86WriteResPairUnsupported<WriteFLogicZ>; // Floating point and/or/xor logicals (ZMM).
defm : Zn3WriteResXMMPair<WriteFTest, [Zn3FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions.
defm : Zn3WriteResYMMPair<WriteFTestY, [Zn3FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (YMM).
defm : X86WriteResPairUnsupported<WriteFTestZ>; // Floating point TEST instructions (ZMM).
defm : Zn3WriteResXMMPair<WriteFShuffle, [Zn3FPVShuf01], 1, [1], 1>; // Floating point vector shuffles.
defm : Zn3WriteResYMMPair<WriteFShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Floating point vector shuffles (YMM).
defm : X86WriteResPairUnsupported<WriteFShuffleZ>; // Floating point vector shuffles (ZMM).
defm : Zn3WriteResXMMPair<WriteFVarShuffle, [Zn3FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles.
defm : Zn3WriteResYMMPair<WriteFVarShuffleY, [Zn3FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles (YMM).
defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>; // Floating point vector variable shuffles (ZMM).
defm : Zn3WriteResXMMPair<WriteFBlend, [Zn3FPFMul01], 1, [1], 1>; // Floating point vector blends.
defm : Zn3WriteResYMMPair<WriteFBlendY, [Zn3FPFMul01], 1, [1], 1>; // Floating point vector blends (YMM).
defm : X86WriteResPairUnsupported<WriteFBlendZ>; // Floating point vector blends (ZMM).
defm : Zn3WriteResXMMPair<WriteFVarBlend, [Zn3FPFMul01], 1, [1], 1>; // Fp vector variable blends.
defm : Zn3WriteResYMMPair<WriteFVarBlendY, [Zn3FPFMul01], 1, [1], 1>; // Fp vector variable blends (YMM).
defm : X86WriteResPairUnsupported<WriteFVarBlendZ>; // Fp vector variable blends (ZMM).

// Horizontal Add/Sub (float and integer)
defm : Zn3WriteResXMMPair<WriteFHAdd,  [Zn3FPFAdd0], 6, [2], 4>;
defm : Zn3WriteResYMMPair<WriteFHAddY, [Zn3FPFAdd0], 6, [2], 3, /*LoadUOps=*/1>;
defm : X86WriteResPairUnsupported<WriteFHAddZ>;
defm : Zn3WriteResXMMPair<WritePHAdd,  [Zn3FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
defm : Zn3WriteResXMMPair<WritePHAddX, [Zn3FPVAdd0], 2, [2], 4>;
defm : Zn3WriteResYMMPair<WritePHAddY, [Zn3FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
defm : X86WriteResPairUnsupported<WritePHAddZ>;

// Vector integer operations.
// Loads: VecLoadLatency + 1, single uop. Stores: StoreLatency, single uop.
defm : Zn3WriteResXMM<WriteVecLoad, [Zn3FPLd01, Zn3Load],
                      !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecLoadX, [Zn3FPLd01, Zn3Load],
                      !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteVecLoadY, [Zn3FPLd01, Zn3Load],
                      !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecLoadNT, [Zn3FPLd01, Zn3Load],
                      !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteVecLoadNTY, [Zn3FPLd01, Zn3Load],
                      !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecMaskedLoad, [Zn3FPLd01, Zn3Load],
                      !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteVecMaskedLoadY, [Zn3FPLd01, Zn3Load],
                      !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecStore, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecStoreX, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [1, 1], 1>;

// 128-bit lane extract, register destination.
def Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn3FPFMisc0]> {
  let Latency = 4;
  let NumMicroOps = 1;
  let ResourceCycles = [1];
}
def : InstRW<[Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr],
             (instrs VEXTRACTF128rr, VEXTRACTI128rr)>;

// 128-bit lane extract, memory destination: one uop more than the rr form.
// NOTE(review): this is a store form (resources Zn3FPSt/Zn3Store) yet its
// latency adds Znver3Model.LoadLatency rather than StoreLatency - confirm
// whether that is intended. The value is left unchanged here.
def Zn3WriteVEXTRACTI128mr : SchedWriteRes<[Zn3FPFMisc0, Zn3FPSt, Zn3Store]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
  let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1);
  let ResourceCycles = [1, 1, 1];
}
def : InstRW<[Zn3WriteVEXTRACTI128mr],
             (instrs VEXTRACTI128mr, VEXTRACTF128mr)>;

// 128-bit lane insert from memory: reuses the extract rr latency plus
// LoadLatency; same uop count as the extract rr form (no extra uop).
def Zn3WriteVINSERTF128rmr : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPFMisc0]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
  let NumMicroOps = Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps;
  let ResourceCycles = [1, 1, 1];
}
def : InstRW<[Zn3WriteVINSERTF128rmr], (instrs VINSERTF128rm)>;

defm : Zn3WriteResYMM<WriteVecStoreY, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecStoreNT, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResYMM<WriteVecStoreNTY, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecMaskedStore32, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [6, 1], 18>;
defm : Zn3WriteResXMM<WriteVecMaskedStore64, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [4, 1], 10>;
defm : Zn3WriteResYMM<WriteVecMaskedStore32Y, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [12, 1], 42>;
defm : Zn3WriteResYMM<WriteVecMaskedStore64Y, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [6, 1], 18>;

// Moves between vector and general purpose registers.
defm : Zn3WriteResXMM<WriteVecMoveToGpr,   [Zn3FPLd01], 1, [2], 1>;
defm : Zn3WriteResXMM<WriteVecMoveFromGpr, [Zn3FPLd01], 1, [2], 1>;

// MMX register moves listed below.
def Zn3WriteMOVMMX : SchedWriteRes<[Zn3FPLd01, Zn3FPFMisc0123]> {
  let Latency = 1;
  let NumMicroOps = 2;
  let ResourceCycles = [1, 2];
}
def : InstRW<[Zn3WriteMOVMMX], (instrs MMX_MOVQ2FR64rr, MMX_MOVQ2DQrr)>;

// Same as Zn3WriteMOVMMX but holds Zn3FPFMisc0123 for 4 cycles instead of 2.
def Zn3WriteMOVMMXSlow : SchedWriteRes<[Zn3FPLd01, Zn3FPFMisc0123]> {
  let Latency = 1;
  let NumMicroOps = 2;
  let ResourceCycles = [1, 4];
}
def : InstRW<[Zn3WriteMOVMMXSlow], (instrs MMX_MOVD64rr, MMX_MOVD64to64rr)>;

defm : Zn3WriteResXMMPair<WriteVecALU, [Zn3FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals.

// EXTRQ/INSERTQ, register-operand forms.
def Zn3WriteEXTRQ_INSERTQ : SchedWriteRes<[Zn3FPVShuf01, Zn3FPLd01]> {
  let Latency = 3;
  let NumMicroOps = 1;
  let ResourceCycles = [1, 1];
}
def : InstRW<[Zn3WriteEXTRQ_INSERTQ], (instrs EXTRQ, INSERTQ)>;

// EXTRQI/INSERTQI: same latency and resources, but two uops.
def Zn3WriteEXTRQI_INSERTQI : SchedWriteRes<[Zn3FPVShuf01, Zn3FPLd01]> {
  let Latency = 3;
  let NumMicroOps = 2;
  let ResourceCycles = [1, 1];
}
def : InstRW<[Zn3WriteEXTRQI_INSERTQI], (instrs EXTRQI, INSERTQI)>;

defm : Zn3WriteResXMMPair<WriteVecALUX, [Zn3FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (XMM).

// XMM ALU instructions bound to Zn3FPVAdd01 rather than the Zn3FPVAdd0123
// group used by WriteVecALUX.
def Zn3WriteVecALUXSlow : SchedWriteRes<[Zn3FPVAdd01]> {
  let Latency = 1;
  let NumMicroOps = 1;
  let ResourceCycles = [1];
}
def : InstRW<[Zn3WriteVecALUXSlow],
             (instrs PABSBrr, PABSDrr, PABSWrr,
                     PADDSBrr, PADDSWrr, PADDUSBrr, PADDUSWrr,
                     PAVGBrr, PAVGWrr,
                     PSIGNBrr, PSIGNDrr, PSIGNWrr,
                     VPABSBrr, VPABSDrr, VPABSWrr,
                     VPADDSBrr, VPADDSWrr, VPADDUSBrr, VPADDUSWrr,
                     VPAVGBrr, VPAVGWrr,
                     VPCMPEQQrr,
                     VPSIGNBrr, VPSIGNDrr, VPSIGNWrr,
                     PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
                     VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr)>;

// MMX counterparts of the group above, same numbers.
def Zn3WriteVecALUXMMX : SchedWriteRes<[Zn3FPVAdd01]> {
  let Latency = 1;
  let NumMicroOps = 1;
  let ResourceCycles = [1];
}
def : InstRW<[Zn3WriteVecALUXMMX],
             (instrs MMX_PABSBrr, MMX_PABSDrr, MMX_PABSWrr,
                     MMX_PSIGNBrr, MMX_PSIGNDrr, MMX_PSIGNWrr,
                     MMX_PADDSBirr, MMX_PADDSWirr, MMX_PADDUSBirr, MMX_PADDUSWirr,
                     MMX_PAVGBirr, MMX_PAVGWirr,
                     MMX_PSUBSBirr, MMX_PSUBSWirr, MMX_PSUBUSBirr, MMX_PSUBUSWirr)>;

defm : Zn3WriteResYMMPair<WriteVecALUY, [Zn3FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (YMM).

// YMM ALU instructions bound to Zn3FPVAdd01 rather than the Zn3FPVAdd0123
// group used by WriteVecALUY.
def Zn3WriteVecALUYSlow : SchedWriteRes<[Zn3FPVAdd01]> {
  let Latency = 1;
  let NumMicroOps = 1;
  let ResourceCycles = [1];
}
def : InstRW<[Zn3WriteVecALUYSlow],
             (instrs VPABSBYrr, VPABSDYrr, VPABSWYrr,
                     VPADDSBYrr, VPADDSWYrr, VPADDUSBYrr, VPADDUSWYrr,
                     VPSUBSBYrr, VPSUBSWYrr, VPSUBUSBYrr, VPSUBUSWYrr,
                     VPAVGBYrr, VPAVGWYrr,
                     VPCMPEQQYrr,
                     VPSIGNBYrr, VPSIGNDYrr, VPSIGNWYrr)>;

defm : X86WriteResPairUnsupported<WriteVecALUZ>; // Vector integer ALU op, no logicals (ZMM).

// Vector integer logicals.
defm : Zn3WriteResXMMPair<WriteVecLogic,  [Zn3FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals.
defm : Zn3WriteResXMMPair<WriteVecLogicX, [Zn3FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (XMM).
defm : Zn3WriteResYMMPair<WriteVecLogicY, [Zn3FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (YMM).
defm : X86WriteResPairUnsupported<WriteVecLogicZ>; // Vector integer and/or/xor logicals (ZMM).

// Vector integer TEST: two resources, two uops.
defm : Zn3WriteResXMMPair<WriteVecTest,  [Zn3FPVAdd12, Zn3FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions.
defm : Zn3WriteResYMMPair<WriteVecTestY, [Zn3FPVAdd12, Zn3FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions (YMM).
defm : X86WriteResPairUnsupported<WriteVecTestZ>; // Vector integer TEST instructions (ZMM).

// Vector integer shifts.
defm : Zn3WriteResXMMPair<WriteVecShift,  [Zn3FPVShift01], 1, [1], 1>; // Vector integer shifts (default).
defm : Zn3WriteResXMMPair<WriteVecShiftX, [Zn3FPVShift01], 1, [1], 1>; // Vector integer shifts (XMM).
defm : Zn3WriteResYMMPair<WriteVecShiftY, [Zn3FPVShift01], 1, [1], 1>; // Vector integer shifts (YMM).
defm : X86WriteResPairUnsupported<WriteVecShiftZ>; // Vector integer shifts (ZMM).
defm : Zn3WriteResXMMPair<WriteVecShiftImm, [Zn3FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (default).
// Vector integer immediate shifts, multiplies, shuffles and blends.
defm : Zn3WriteResXMMPair<WriteVecShiftImmX, [Zn3FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (XMM).
defm : Zn3WriteResYMMPair<WriteVecShiftImmY, [Zn3FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (YMM).
defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>; // Vector integer immediate shifts (ZMM).
defm : Zn3WriteResXMMPair<WriteVecIMul, [Zn3FPVMul01], 3, [1], 1>; // Vector integer multiply (default).
defm : Zn3WriteResXMMPair<WriteVecIMulX, [Zn3FPVMul01], 3, [1], 1>; // Vector integer multiply (XMM).
defm : Zn3WriteResYMMPair<WriteVecIMulY, [Zn3FPVMul01], 3, [1], 1>; // Vector integer multiply (YMM).
defm : X86WriteResPairUnsupported<WriteVecIMulZ>; // Vector integer multiply (ZMM).
defm : Zn3WriteResXMMPair<WritePMULLD, [Zn3FPVMul01], 3, [1], 1>; // Vector PMULLD.
defm : Zn3WriteResYMMPair<WritePMULLDY, [Zn3FPVMul01], 3, [1], 1>; // Vector PMULLD (YMM).
defm : X86WriteResPairUnsupported<WritePMULLDZ>; // Vector PMULLD (ZMM).
defm : Zn3WriteResXMMPair<WriteShuffle, [Zn3FPVShuf01], 1, [1], 1>; // Vector shuffles.
defm : Zn3WriteResXMMPair<WriteShuffleX, [Zn3FPVShuf01], 1, [1], 1>; // Vector shuffles (XMM).
defm : Zn3WriteResYMMPair<WriteShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Vector shuffles (YMM).
defm : X86WriteResPairUnsupported<WriteShuffleZ>; // Vector shuffles (ZMM).
// NOTE(review): the variable shuffles below use Zn3FPVShift01, whereas
// WriteShuffle* above and the FP variable shuffles use Zn3FPVShuf01 -
// confirm this is intended and not a Shift/Shuf mix-up.
defm : Zn3WriteResXMMPair<WriteVarShuffle, [Zn3FPVShift01], 1, [1], 1>; // Vector variable shuffles.
defm : Zn3WriteResXMMPair<WriteVarShuffleX, [Zn3FPVShift01], 1, [1], 1>; // Vector variable shuffles (XMM).
defm : Zn3WriteResYMMPair<WriteVarShuffleY, [Zn3FPVShift01], 1, [1], 1>; // Vector variable shuffles (YMM).
defm : X86WriteResPairUnsupported<WriteVarShuffleZ>; // Vector variable shuffles (ZMM).
defm : Zn3WriteResXMMPair<WriteBlend, [Zn3FPVMisc0123], 1, [1], 1>; // Vector blends.
defm : Zn3WriteResYMMPair<WriteBlendY, [Zn3FPVMisc0123], 1, [1], 1>; // Vector blends (YMM).
// Vector variable blends, PSADBW, MPSAD and PHMINPOS.
defm : X86WriteResPairUnsupported<WriteBlendZ>; // Vector blends (ZMM).
defm : Zn3WriteResXMMPair<WriteVarBlend, [Zn3FPVMul01], 1, [1], 1>; // Vector variable blends.
defm : Zn3WriteResYMMPair<WriteVarBlendY, [Zn3FPVMul01], 1, [1], 1>; // Vector variable blends (YMM).
defm : X86WriteResPairUnsupported<WriteVarBlendZ>; // Vector variable blends (ZMM).
defm : Zn3WriteResXMMPair<WritePSADBW, [Zn3FPVAdd0123], 3, [2], 1>; // Vector PSADBW.
defm : Zn3WriteResXMMPair<WritePSADBWX, [Zn3FPVAdd0123], 3, [2], 1>; // Vector PSADBW (XMM).
defm : Zn3WriteResYMMPair<WritePSADBWY, [Zn3FPVAdd0123], 3, [2], 1>; // Vector PSADBW (YMM).
defm : X86WriteResPairUnsupported<WritePSADBWZ>; // Vector PSADBW (ZMM).
defm : Zn3WriteResXMMPair<WriteMPSAD, [Zn3FPVAdd0123], 4, [8], 4, /*LoadUOps=*/2>; // Vector MPSAD.
defm : Zn3WriteResYMMPair<WriteMPSADY, [Zn3FPVAdd0123], 4, [8], 3, /*LoadUOps=*/1>; // Vector MPSAD (YMM).
defm : X86WriteResPairUnsupported<WriteMPSADZ>; // Vector MPSAD (ZMM).
defm : Zn3WriteResXMMPair<WritePHMINPOS, [Zn3FPVAdd01], 3, [1], 1>; // Vector PHMINPOS.

// Vector insert/extract operations.
// NOTE(review): /*LoadUOps=*/-1 is negative; presumably the load-folding
// form is modelled with one uop fewer than the register form - confirm
// against the Zn3WriteResXMMPair definition.
defm : Zn3WriteResXMMPair<WriteVecInsert, [Zn3FPLd01], 1, [2], 2, /*LoadUOps=*/-1>; // Insert gpr to vector element.
defm : Zn3WriteResXMM<WriteVecExtract, [Zn3FPLd01], 1, [2], 2>; // Extract vector element to gpr.
defm : Zn3WriteResXMM<WriteVecExtractSt, [Zn3FPSt, Zn3Store], !add(1, Znver3Model.StoreLatency), [1, 1], 2>; // Extract vector element and store.

// MOVMSK operations.
defm : Zn3WriteResXMM<WriteFMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;
defm : Zn3WriteResXMM<WriteVecMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;
defm : Zn3WriteResYMM<WriteVecMOVMSKY, [Zn3FPVMisc2], 1, [1], 1>;
defm : Zn3WriteResXMM<WriteMMXMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;

// Conversion between integer and float.
defm : Zn3WriteResXMMPair<WriteCvtSD2I, [Zn3FPFCvt01], 2, [2], 2>; // Double -> Integer.
// Packed double -> integer.
defm : Zn3WriteResXMMPair<WriteCvtPD2I,  [Zn3FPFCvt01], 3, [1], 1>; // Double -> Integer (XMM).
defm : Zn3WriteResYMMPair<WriteCvtPD2IY, [Zn3FPFCvt01], 6, [2], 2>; // Double -> Integer (YMM).
defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>; // Double -> Integer (ZMM).

// MMX-destination double -> integer conversions.
// NOTE(review): the irm (memory) and irr (register) forms share this one
// record, which lists no load resources - confirm that is intended.
def Zn3WriteCvtPD2IMMX : SchedWriteRes<[Zn3FPFCvt01]> {
  let Latency = 1;
  let NumMicroOps = 2;
  let ResourceCycles = [2];
}
def : InstRW<[Zn3WriteCvtPD2IMMX],
             (instrs MMX_CVTPD2PIirm, MMX_CVTTPD2PIirm,
                     MMX_CVTPD2PIirr, MMX_CVTTPD2PIirr)>;

defm : Zn3WriteResXMMPair<WriteCvtSS2I, [Zn3FPFCvt01], 2, [2], 2>; // Float -> Integer.

defm : Zn3WriteResXMMPair<WriteCvtPS2I,  [Zn3FPFCvt01], 3, [1], 1>; // Float -> Integer (XMM).
defm : Zn3WriteResYMMPair<WriteCvtPS2IY, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Integer (YMM).
defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>; // Float -> Integer (ZMM).

defm : Zn3WriteResXMMPair<WriteCvtI2SD,  [Zn3FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double.
defm : Zn3WriteResXMMPair<WriteCvtI2PD,  [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Double (XMM).
defm : Zn3WriteResYMMPair<WriteCvtI2PDY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double (YMM).
defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>; // Integer -> Double (ZMM).

// MMX-source integer -> double conversions; as above, the memory and
// register forms share one record.
def Zn3WriteCvtI2PDMMX : SchedWriteRes<[Zn3FPFCvt01]> {
  let Latency = 2;
  let NumMicroOps = 2;
  let ResourceCycles = [6];
}
def : InstRW<[Zn3WriteCvtI2PDMMX], (instrs MMX_CVTPI2PDirm, MMX_CVTPI2PDirr)>;

defm : Zn3WriteResXMMPair<WriteCvtI2SS,  [Zn3FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Float.
defm : Zn3WriteResXMMPair<WriteCvtI2PS,  [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Float (XMM).
defm : Zn3WriteResYMMPair<WriteCvtI2PSY, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Float (YMM).
defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>; // Integer -> Float (ZMM).

// MMX-source integer -> float conversion, register form only.
def Zn3WriteCvtI2PSMMX : SchedWriteRes<[Zn3FPFCvt01]> {
  let Latency = 3;
  let NumMicroOps = 2;
  let ResourceCycles = [1];
}
def : InstRW<[Zn3WriteCvtI2PSMMX], (instrs MMX_CVTPI2PSirr)>;

defm : Zn3WriteResXMMPair<WriteCvtSS2SD,  [Zn3FPFCvt01], 3, [1], 1>; // Float -> Double size conversion.
defm : Zn3WriteResXMMPair<WriteCvtPS2PD,  [Zn3FPFCvt01], 3, [1], 1>; // Float -> Double size conversion (XMM).
defm : Zn3WriteResYMMPair<WriteCvtPS2PDY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Float -> Double size conversion (YMM).
defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>; // Float -> Double size conversion (ZMM).

defm : Zn3WriteResXMMPair<WriteCvtSD2SS,  [Zn3FPFCvt01], 3, [1], 1>; // Double -> Float size conversion.
defm : Zn3WriteResXMMPair<WriteCvtPD2PS,  [Zn3FPFCvt01], 3, [1], 1>; // Double -> Float size conversion (XMM).
defm : Zn3WriteResYMMPair<WriteCvtPD2PSY, [Zn3FPFCvt01], 6, [2], 2>; // Double -> Float size conversion (YMM).
defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>; // Double -> Float size conversion (ZMM).

defm : Zn3WriteResXMMPair<WriteCvtPH2PS,  [Zn3FPFCvt01], 3, [1], 1>; // Half -> Float size conversion.
defm : Zn3WriteResYMMPair<WriteCvtPH2PSY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Half -> Float size conversion (YMM).
defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>; // Half -> Float size conversion (ZMM).

// Float -> Half has no load-folding pair; the store variants are separate
// classes whose latency adds the model's StoreLatency.
defm : Zn3WriteResXMM<WriteCvtPS2PH,  [Zn3FPFCvt01], 3, [2], 1>; // Float -> Half size conversion.
defm : Zn3WriteResYMM<WriteCvtPS2PHY, [Zn3FPFCvt01], 6, [2], 2>; // Float -> Half size conversion (YMM).
defm : X86WriteResUnsupported<WriteCvtPS2PHZ>; // Float -> Half size conversion (ZMM).
defm : Zn3WriteResXMM<WriteCvtPS2PHSt, [Zn3FPFCvt01, Zn3FPSt, Zn3Store],
                      !add(3, Znver3Model.StoreLatency), [1, 1, 1], 2>; // Float -> Half + store size conversion.
defm : Zn3WriteResYMM<WriteCvtPS2PHYSt, [Zn3FPFCvt01, Zn3FPSt, Zn3Store],
                      !add(6, Znver3Model.StoreLatency), [2, 1, 1], 3>; // Float -> Half + store size conversion (YMM).
defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>; // Float -> Half + store size conversion (ZMM).

// CRC32 instruction.
defm : Zn3WriteResIntPair<WriteCRC32, [Zn3ALU1], 3, [1], 1>;

// SHA extensions. Each rm record derives its latency from the matching rr
// record plus LoadLatency; where the uop count equals the rr count, the
// rr value is referenced directly.
def Zn3WriteSHA1MSG1rr : SchedWriteRes<[Zn3FPU0123]> {
  let Latency = 2;
  let NumMicroOps = 2;
  let ResourceCycles = [2];
}
def : InstRW<[Zn3WriteSHA1MSG1rr], (instrs SHA1MSG1rr)>;

def Zn3WriteSHA1MSG1rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA1MSG1rr.Latency);
  let NumMicroOps = Zn3WriteSHA1MSG1rr.NumMicroOps;
  let ResourceCycles = [1, 1, 2];
}
def : InstRW<[Zn3WriteSHA1MSG1rm], (instrs SHA1MSG1rm)>;

def Zn3WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn3FPU0123]> {
  let Latency = 1;
  let NumMicroOps = 1;
  let ResourceCycles = [2];
}
def : InstRW<[Zn3WriteSHA1MSG2rr_SHA1NEXTErr],
             (instrs SHA1MSG2rr, SHA1NEXTErr)>;

def Zn3Writerm_SHA1MSG2rm_SHA1NEXTErm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
  let NumMicroOps = Zn3WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps;
  let ResourceCycles = [1, 1, 2];
}
def : InstRW<[Zn3Writerm_SHA1MSG2rm_SHA1NEXTErm],
             (instrs SHA1MSG2rm, SHA1NEXTErm)>;

def Zn3WriteSHA256MSG1rr : SchedWriteRes<[Zn3FPU0123]> {
  let Latency = 2;
  let NumMicroOps = 2;
  let ResourceCycles = [3];
}
def : InstRW<[Zn3WriteSHA256MSG1rr], (instrs SHA256MSG1rr)>;

def Zn3Writerm_SHA256MSG1rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA256MSG1rr.Latency);
  let NumMicroOps = Zn3WriteSHA256MSG1rr.NumMicroOps;
  let ResourceCycles = [1, 1, 3];
}
def : InstRW<[Zn3Writerm_SHA256MSG1rm], (instrs SHA256MSG1rm)>;

def Zn3WriteSHA256MSG2rr : SchedWriteRes<[Zn3FPU0123]> {
  let Latency = 3;
  let NumMicroOps = 4;
  let ResourceCycles = [8];
}
def : InstRW<[Zn3WriteSHA256MSG2rr], (instrs SHA256MSG2rr)>;

// Unlike the other SHA rm records, this one has one uop more than its rr form.
def Zn3WriteSHA256MSG2rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA256MSG2rr.Latency);
  let NumMicroOps = !add(Zn3WriteSHA256MSG2rr.NumMicroOps, 1);
  let ResourceCycles = [1, 1, 8];
}
def : InstRW<[Zn3WriteSHA256MSG2rm], (instrs SHA256MSG2rm)>;

def Zn3WriteSHA1RNDS4rri : SchedWriteRes<[Zn3FPU0123]> {
  let Latency = 6;
  let NumMicroOps = 1;
  let ResourceCycles = [8];
}
def : InstRW<[Zn3WriteSHA1RNDS4rri], (instrs SHA1RNDS4rri)>;

def Zn3WriteSHA256RNDS2rr : SchedWriteRes<[Zn3FPU0123]> {
  let Latency = 4;
  let NumMicroOps = 1;
  let ResourceCycles = [8];
}
def : InstRW<[Zn3WriteSHA256RNDS2rr], (instrs SHA256RNDS2rr)>;

// String instructions.
// Packed Compare Implicit Length Strings, Return Mask
defm : Zn3WriteResXMMPair<WritePCmpIStrM, [Zn3FPVAdd0123], 6, [8], 3, /*LoadUOps=*/1>;
// Packed Compare Explicit Length Strings, Return Mask
defm : Zn3WriteResXMMPair<WritePCmpEStrM, [Zn3FPVAdd0123], 6, [12], 7, /*LoadUOps=*/5>;
// Packed Compare Implicit Length Strings, Return Index
defm : Zn3WriteResXMMPair<WritePCmpIStrI, [Zn3FPVAdd0123], 2, [8], 4>;
// Packed Compare Explicit Length Strings, Return Index
defm : Zn3WriteResXMMPair<WritePCmpEStrI, [Zn3FPVAdd0123], 6, [12], 8, /*LoadUOps=*/4>;

// AES instructions.
defm : Zn3WriteResXMMPair<WriteAESDecEnc, [Zn3FPAES01], 4, [1], 1>; // Decryption, encryption.
defm : Zn3WriteResXMMPair<WriteAESIMC,    [Zn3FPAES01], 4, [1], 1>; // InvMixColumn.
// AES Key Generation.
defm : Zn3WriteResXMMPair<WriteAESKeyGen, [Zn3FPAES01], 4, [1], 1>;

// Carry-less multiplication instructions.
defm : Zn3WriteResXMMPair<WriteCLMul, [Zn3FPCLM01], 4, [4], 4>;

// EMMS/FEMMS
// FIXME: latency not from llvm-exegesis
defm : Zn3WriteResInt<WriteEMMS, [Zn3ALU0123], 2, [1], 1>;

// Load/store MXCSR
// Load: latency is the model's LoadLatency plus one cycle.
// FIXME: latency not from llvm-exegesis
defm : Zn3WriteResInt<WriteLDMXCSR, [Zn3AGU012, Zn3Load, Zn3ALU0123],
                      !add(Znver3Model.LoadLatency, 1), [1, 1, 6], 1>;
// Store: latency is one cycle plus the model's StoreLatency.
// FIXME: latency not from llvm-exegesis
defm : Zn3WriteResInt<WriteSTMXCSR, [Zn3ALU0123, Zn3AGU012, Zn3Store],
                      !add(1, Znver3Model.StoreLatency), [60, 1, 1], 2>;

// Catch-all for expensive system instructions.
defm : Zn3WriteResInt<WriteSystem, [Zn3ALU0123], 100, [100], 100>;

// VZEROUPPER is modelled with zero latency.
def Zn3WriteVZEROUPPER : SchedWriteRes<[Zn3FPU0123]> {
  // FIXME: not from llvm-exegesis
  let Latency = 0;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteVZEROUPPER], (instrs VZEROUPPER)>;

// VZEROALL: 18 micro-ops occupying Zn3FPU0123 for 24 cycles.
def Zn3WriteVZEROALL : SchedWriteRes<[Zn3FPU0123]> {
  // FIXME: not from llvm-exegesis
  let Latency = 10;
  let ResourceCycles = [24];
  let NumMicroOps = 18;
}
def : InstRW<[Zn3WriteVZEROALL], (instrs VZEROALL)>;

// AVX2.
// Fp 256-bit width vector shuffles.
defm : Zn3WriteResYMMPair<WriteFShuffle256, [Zn3FPVShuf],
                          2, [1], 1, /*LoadUOps=*/2>;
// Fp 256-bit width variable shuffles.
defm : Zn3WriteResYMMPair<WriteFVarShuffle256, [Zn3FPVShuf],
                          7, [1], 2, /*LoadUOps=*/1>;
// 256-bit width vector shuffles.
defm : Zn3WriteResYMMPair<WriteShuffle256, [Zn3FPVShuf],
                          2, [1], 1>;
//
// 256-bit permutes.
//
// Memory forms add Zn3AGU012 and Zn3Load to the resource list and compute
// their latency as Znver3Model.LoadLatency plus the latency of the register
// form they are derived from.
//

// VPERM2I128 / VPERM2F128, register forms.
def Zn3WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn3FPVShuf]> {
  let Latency = 3;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteVPERM2I128rr_VPERM2F128rr],
             (instrs VPERM2I128rr, VPERM2F128rr)>;

// VPERM2F128, memory form: same micro-op count as the register form.
def Zn3WriteVPERM2F128rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
  let Latency = !add(Znver3Model.LoadLatency,
                     Zn3WriteVPERM2I128rr_VPERM2F128rr.Latency);
  let ResourceCycles = [1, 1, 1];
  let NumMicroOps = !add(Zn3WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0);
}
def : InstRW<[Zn3WriteVPERM2F128rm], (instrs VPERM2F128rm)>;

// VPERMPS (YMM), register form.
def Zn3WriteVPERMPSYrr : SchedWriteRes<[Zn3FPVShuf]> {
  let Latency = 7;
  let ResourceCycles = [1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteVPERMPSYrr], (instrs VPERMPSYrr)>;

// VPERMPS (YMM), memory form: one micro-op more than the register form.
def Zn3WriteVPERMPSYrm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERMPSYrr.Latency);
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn3WriteVPERMPSYrr.NumMicroOps, 1);
}
def : InstRW<[Zn3WriteVPERMPSYrm], (instrs VPERMPSYrm)>;

// VPERMPD / VPERMQ (YMM), register+immediate forms.
def Zn3WriteVPERMYri : SchedWriteRes<[Zn3FPVShuf]> {
  let Latency = 6;
  let ResourceCycles = [1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;

// VPERMPD (YMM), memory+immediate form: one micro-op more than
// Zn3WriteVPERMYri.
def Zn3WriteVPERMPDYmi : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERMYri.Latency);
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn3WriteVPERMYri.NumMicroOps, 1);
}
def : InstRW<[Zn3WriteVPERMPDYmi], (instrs VPERMPDYmi)>;

// VPERMD (YMM), register form.
def Zn3WriteVPERMDYrr : SchedWriteRes<[Zn3FPVShuf]> {
  let Latency = 5;
  let ResourceCycles = [1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteVPERMDYrr], (instrs VPERMDYrr)>;

// VPERMQ (memory+immediate) and VPERMD (memory) share this write; both are
// derived from Zn3WriteVPERMDYrr with no additional micro-op.
// NOTE(review): VPERMQYri is modelled by Zn3WriteVPERMYri, yet VPERMQYmi is
// derived from Zn3WriteVPERMDYrr here - confirm this grouping is intended.
def Zn3WriteVPERMYm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERMDYrr.Latency);
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn3WriteVPERMDYrr.NumMicroOps, 0);
}
def : InstRW<[Zn3WriteVPERMYm], (instrs VPERMQYmi, VPERMDYrm)>;

// 256-bit width packed vector width-changing move.
defm : Zn3WriteResYMMPair<WriteVPMOV256, [Zn3FPVShuf01],
                          4, [3], 2, /*LoadUOps=*/-1>;
// 256-bit width vector variable shuffles.
defm : Zn3WriteResYMMPair<WriteVarShuffle256, [Zn3FPVShift01],
                          1, [1], 2>;
// Variable vector shifts.
defm : Zn3WriteResXMMPair<WriteVarVecShift, [Zn3FPVShift01],
                          1, [1], 1>;
// Variable vector shifts (YMM).
defm : Zn3WriteResYMMPair<WriteVarVecShiftY, [Zn3FPVShift01],
                          1, [1], 1>;
// Variable vector shifts (ZMM).
defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;

// Old microcoded instructions that nobody uses.
defm : Zn3WriteResInt<WriteMicrocoded, [Zn3ALU0123], 100, [100], 100>;

// Fence instructions.
defm : Zn3WriteResInt<WriteFence, [Zn3ALU0123], 1, [100], 1>;

// LFENCE occupies Zn3LSU for 30 cycles.
def Zn3WriteLFENCE : SchedWriteRes<[Zn3LSU]> {
  let Latency = 1;
  let ResourceCycles = [30];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteLFENCE], (instrs LFENCE)>;

// SFENCE occupies Zn3LSU for a single cycle.
def Zn3WriteSFENCE : SchedWriteRes<[Zn3LSU]> {
  let Latency = 1;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteSFENCE], (instrs SFENCE)>;

// Nop, not very useful except it provides a model for nops!
// FIXME: latency not from llvm-exegesis
defm : Zn3WriteResInt<WriteNop, [Zn3ALU0123], 0, [1], 1>;


///////////////////////////////////////////////////////////////////////////////
// Zero Cycle Move
///////////////////////////////////////////////////////////////////////////////

// Zero latency, no execution resources, a single micro-op.
def Zn3WriteZeroLatency : SchedWriteRes<[]> {
  let Latency = 0;
  let ResourceCycles = [];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteZeroLatency],
             (instrs MOV32rr, MOV32rr_REV,
                     MOV64rr, MOV64rr_REV,
                     MOVSX32rr32)>;

// Same as Zn3WriteZeroLatency, but two micro-ops; used for the
// register-register XCHG forms listed below.
def Zn3WriteSwapRenameable : SchedWriteRes<[]> {
  let Latency = 0;
  let ResourceCycles = [];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteSwapRenameable],
             (instrs XCHG32rr, XCHG32ar,
                     XCHG64rr, XCHG64ar)>;

// Compare+Exchange - TODO RMW support.
defm : Zn3WriteResInt<WriteXCHG, [Zn3ALU0123], 0, [8], 2>;

// Empty sched class
defm : Zn3WriteResXMM<WriteFMove, [Zn3FPVMisc0123], 1, [1], 1>;
// The X/Y fp moves use no resources and have zero latency.
defm : Zn3WriteResXMM<WriteFMoveX, [], 0, [], 1>;
defm : Zn3WriteResYMM<WriteFMoveY, [], 0, [], 1>;

// MMX
defm : Zn3WriteResXMM<WriteVecMove, [Zn3FPFMisc0123], 1, [1], 1>;
// The X/Y vector moves use no resources and have zero latency.
defm : Zn3WriteResXMM<WriteVecMoveX, [], 0, [], 1>;
defm : Zn3WriteResYMM<WriteVecMoveY, [], 0, [], 1>;

// Register moves registered as optimizable, under TruePred.
def : IsOptimizableRegisterMove<[
  InstructionEquivalenceClass<[
    // GPR variants.
    MOV32rr, MOV32rr_REV,
    MOV64rr, MOV64rr_REV,
    MOVSX32rr32,
    XCHG32rr, XCHG32ar,
    XCHG64rr, XCHG64ar,

    // MMX variants.
    // MMX moves are *NOT* eliminated.

    // SSE variants.
    MOVAPSrr, MOVAPSrr_REV,
    MOVUPSrr, MOVUPSrr_REV,
    MOVAPDrr, MOVAPDrr_REV,
    MOVUPDrr, MOVUPDrr_REV,
    MOVDQArr, MOVDQArr_REV,
    MOVDQUrr, MOVDQUrr_REV,

    // AVX variants.
    VMOVAPSrr, VMOVAPSrr_REV,
    VMOVUPSrr, VMOVUPSrr_REV,
    VMOVAPDrr, VMOVAPDrr_REV,
    VMOVUPDrr, VMOVUPDrr_REV,
    VMOVDQArr, VMOVDQArr_REV,
    VMOVDQUrr, VMOVDQUrr_REV,

    // AVX YMM variants.
    VMOVAPSYrr, VMOVAPSYrr_REV,
    VMOVUPSYrr, VMOVUPSYrr_REV,
    VMOVAPDYrr, VMOVAPDYrr_REV,
    VMOVUPDYrr, VMOVUPDYrr_REV,
    VMOVDQAYrr, VMOVDQAYrr_REV,
    VMOVDQUYrr, VMOVDQUYrr_REV,
  ], TruePred >
]>;

///////////////////////////////////////////////////////////////////////////////
// Dependency breaking instructions.
///////////////////////////////////////////////////////////////////////////////

// GPR zero idioms: Zn3WriteZeroLatency when ZeroIdiomPredicate holds,
// WriteALU otherwise.
def Zn3WriteZeroIdiom : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteALU]>
]>;
def : InstRW<[Zn3WriteZeroIdiom],
             (instrs XOR32rr, XOR32rr_REV,
                     XOR64rr, XOR64rr_REV,
                     SUB32rr, SUB32rr_REV,
                     SUB64rr, SUB64rr_REV)>;

// Register-register CMP: Zn3WriteZeroLatency when operands 0 and 1 are the
// same register, WriteALU otherwise.
def Zn3WriteZeroIdiomEFLAGS : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<CheckSameRegOperand<0, 1>>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteALU]>
]>;
def : InstRW<[Zn3WriteZeroIdiomEFLAGS],
             (instrs CMP8rr, CMP8rr_REV,
                     CMP16rr, CMP16rr_REV,
                     CMP32rr, CMP32rr_REV,
                     CMP64rr, CMP64rr_REV)>;

// XMM fp logic zero idioms: Zn3WriteZeroLatency when ZeroIdiomPredicate
// holds, WriteFLogic otherwise.
def Zn3WriteFZeroIdiom : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteFLogic]>
]>;
// NOTE: XORPSrr, XORPDrr are not zero-cycle!
// Only the VEX-encoded XMM fp logic forms are bound to Zn3WriteFZeroIdiom.
def : InstRW<[Zn3WriteFZeroIdiom],
             (instrs VXORPSrr, VXORPDrr,
                     VANDNPSrr, VANDNPDrr)>;

// YMM fp logic zero idioms: Zn3WriteZeroLatency when ZeroIdiomPredicate
// holds, WriteFLogicY otherwise.
def Zn3WriteFZeroIdiomY : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteFLogicY]>
]>;
def : InstRW<[Zn3WriteFZeroIdiomY],
             (instrs VXORPSYrr, VXORPDYrr,
                     VANDNPSYrr, VANDNPDYrr)>;

// XMM integer logic zero idioms: Zn3WriteZeroLatency when ZeroIdiomPredicate
// holds, WriteVecLogicX otherwise.
def Zn3WriteVZeroIdiomLogicX : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteVecLogicX]>
]>;
// NOTE: PXORrr,PANDNrr are not zero-cycle!
def : InstRW<[Zn3WriteVZeroIdiomLogicX], (instrs VPXORrr, VPANDNrr)>;

// YMM integer logic zero idioms: Zn3WriteZeroLatency when ZeroIdiomPredicate
// holds, WriteVecLogicY otherwise.
def Zn3WriteVZeroIdiomLogicY : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteVecLogicY]>
]>;
def : InstRW<[Zn3WriteVZeroIdiomLogicY], (instrs VPXORYrr, VPANDNYrr)>;

// XMM integer ALU zero idioms: Zn3WriteZeroLatency when ZeroIdiomPredicate
// holds, WriteVecALUX otherwise.
def Zn3WriteVZeroIdiomALUX : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteVecALUX]>
]>;
// NOTE: PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
//       PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr are not zero-cycle!
def : InstRW<[Zn3WriteVZeroIdiomALUX],
             (instrs VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
                     VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr)>;

// YMM integer ALU zero idioms: Zn3WriteZeroLatency when ZeroIdiomPredicate
// holds, WriteVecALUY otherwise.
def Zn3WriteVZeroIdiomALUY : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteVecALUY]>
]>;
def : InstRW<[Zn3WriteVZeroIdiomALUY],
             (instrs VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
                     VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr)>;

// Instructions registered as zero idioms; every class below is guarded by
// ZeroIdiomPredicate.
def : IsZeroIdiomFunction<[
  // GPR Zero-idioms.
  DepBreakingClass<[
    XOR32rr, XOR32rr_REV,
    XOR64rr, XOR64rr_REV,
    SUB32rr, SUB32rr_REV,
    SUB64rr, SUB64rr_REV
  ], ZeroIdiomPredicate>,

  // SSE XMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    XORPSrr, XORPDrr,
    ANDNPSrr, ANDNPDrr,

    // int variants.
    PXORrr,
    PANDNrr,
    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
    PSUBSBrr, PSUBSWrr,
    PSUBUSBrr, PSUBUSWrr,
    PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr
  ], ZeroIdiomPredicate>,

  // AVX XMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    VXORPSrr, VXORPDrr,
    VANDNPSrr, VANDNPDrr,

    // int variants.
    VPXORrr,
    VPANDNrr,
    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
    VPSUBSBrr, VPSUBSWrr,
    VPSUBUSBrr, VPSUBUSWrr,
    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
  ], ZeroIdiomPredicate>,

  // AVX YMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    VXORPSYrr, VXORPDYrr,
    VANDNPSYrr, VANDNPDYrr,

    // int variants.
    VPXORYrr,
    VPANDNYrr,
    VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
    VPSUBSBYrr, VPSUBSWYrr,
    VPSUBUSBYrr, VPSUBUSWYrr,
    VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr
  ], ZeroIdiomPredicate>,
]>;

// Instructions registered as dependency-breaking. The CMP class is guarded
// by CheckSameRegOperand<0, 1>; every other class by ZeroIdiomPredicate.
def : IsDepBreakingFunction<[
  // GPR
  DepBreakingClass<[
    SBB32rr, SBB32rr_REV,
    SBB64rr, SBB64rr_REV
  ], ZeroIdiomPredicate>,
  DepBreakingClass<[
    CMP8rr, CMP8rr_REV,
    CMP16rr, CMP16rr_REV,
    CMP32rr, CMP32rr_REV,
    CMP64rr, CMP64rr_REV
  ], CheckSameRegOperand<0, 1> >,

  // MMX
  DepBreakingClass<[
    MMX_PCMPEQBirr, MMX_PCMPEQWirr, MMX_PCMPEQDirr
  ], ZeroIdiomPredicate>,

  // SSE
  DepBreakingClass<[
    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
  ], ZeroIdiomPredicate>,

  // AVX XMM
  DepBreakingClass<[
    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
  ], ZeroIdiomPredicate>,

  // AVX YMM
  DepBreakingClass<[
    VPCMPEQBYrr, VPCMPEQWYrr, VPCMPEQDYrr, VPCMPEQQYrr
  ], ZeroIdiomPredicate>,
]>;

} // SchedModel