1//=- X86ScheduleZnver3.td - X86 Znver3 Scheduling ------------*- tablegen -*-=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the machine model for Znver3 to support instruction
10// scheduling and other instruction cost heuristics.
11// Based on:
12//  * AMD Software Optimization Guide for AMD Family 19h Processors.
13//    https://www.amd.com/system/files/TechDocs/56665.zip
14//  * The microarchitecture of Intel, AMD and VIA CPUs, By Agner Fog
15//    http://www.agner.org/optimize/microarchitecture.pdf
16//  * AMD Zen 3 Ryzen Deep Dive Review
17//    https://www.anandtech.com/show/16214/
18//===----------------------------------------------------------------------===//
19
// Top-level machine model record for Znver3. Besides overriding the standard
// SchedMachineModel fields with `let`, it declares two extra `int` fields
// (VecLoadLatency, StoreLatency) that the rest of this file reads as
// Znver3Model.VecLoadLatency / Znver3Model.StoreLatency.
20def Znver3Model : SchedMachineModel {
21  // AMD SOG 19h, 2.9.6 Dispatch
22  // The processor may dispatch up to 6 macro ops per cycle
23  // into the execution engine.
24  let IssueWidth = 6;
25  // AMD SOG 19h, 2.10.3
26  // The retire control unit (RCU) tracks the completion status of all
27  // outstanding operations (integer, load/store, and floating-point) and is
28  // the final arbiter for exception processing and recovery.
29  // The unit can receive up to 6 macro ops dispatched per cycle and track up
30  // to 256 macro ops in-flight in non-SMT mode or 128 per thread in SMT mode.
31  let MicroOpBufferSize = 256;
32  // AMD SOG 19h, 2.9.1 Op Cache
33  // The op cache is organized as an associative cache with 64 sets and 8 ways.
34  // At each set-way intersection is an entry containing up to 8 macro ops.
35  // The maximum capacity of the op cache is 4K ops.
36  // Agner, 22.5 µop cache
37  // The size of the µop cache is big enough for holding most critical loops.
38  // FIXME: PR50384: IndVars has quadratic complexity, with large values here
39  //        the compilation of certain loops ends up taking way too long.
40  // let LoopMicroOpBufferSize = 4096;
41  let LoopMicroOpBufferSize = 256;
42  // AMD SOG 19h, 2.6.2 L1 Data Cache
43  // The L1 data cache has a 4- or 5- cycle integer load-to-use latency.
44  // AMD SOG 19h, 2.12 L1 Data Cache
45  // The AGU and LS pipelines are optimized for simple address generation modes.
46  // <...> and can achieve 4-cycle load-to-use integer load latency.
47  let LoadLatency = 4;
48  // AMD SOG 19h, 2.12 L1 Data Cache
49  // The AGU and LS pipelines are optimized for simple address generation modes.
50  // <...> and can achieve <...> 7-cycle load-to-use FP load latency.
  // Declared here as a model-local `int` field (not a `let` override).
51  int VecLoadLatency = 7;
52  // Latency of a simple store operation.
  // Also a model-local `int` field, like VecLoadLatency.
53  int StoreLatency = 1;
54  // FIXME
55  let HighLatency = 25; // FIXME: any better choice?
56  // AMD SOG 19h, 2.8 Optimizing Branching
57  // The branch misprediction penalty is in the range from 11 to 18 cycles,
58  // <...>. The common case penalty is 13 cycles.
59  let MispredictPenalty = 13;
60
61  let PostRAScheduler = 1; // Enable Post RegAlloc Scheduler pass.
62
63  let CompleteModel = 1;
64}
65
66let SchedModel = Znver3Model in {
67
68
69//===----------------------------------------------------------------------===//
70// RCU
71//===----------------------------------------------------------------------===//
72
73// AMD SOG 19h, 2.10.3 Retire Control Unit
74// The unit can receive up to 6 macro ops dispatched per cycle and track up to
75// 256 macro ops in-flight in non-SMT mode or 128 per thread in SMT mode. <...>
76// The retire unit handles in-order commit of up to eight macro ops per cycle.
// First argument: reorder-buffer capacity, taken from the model's
// MicroOpBufferSize (256). Second argument: 8, matching the eight macro ops
// per cycle retire rate quoted above.
77def Zn3RCU : RetireControlUnit<Znver3Model.MicroOpBufferSize, 8>;
78
79//===----------------------------------------------------------------------===//
80// Units
81//===----------------------------------------------------------------------===//
82
83// There are a total of three units, each one with its own schedulers.
84
85//===----------------------------------------------------------------------===//
86// Integer Execution Unit
87//
88
89// AMD SOG 19h, 2.4 Superscalar Organization
90// The processor uses four decoupled independent integer scheduler queues,
91// each one servicing one ALU pipeline and one or two other pipelines
92
93//
94// Execution pipes
95//===----------------------------------------------------------------------===//
96
97// AMD SOG 19h, 2.10.2 Execution Units
98// The processor contains 4 general purpose integer execution pipes.
99// Each pipe has an ALU capable of general purpose integer operations.
100def Zn3ALU0 : ProcResource<1>;
101def Zn3ALU1 : ProcResource<1>;
102def Zn3ALU2 : ProcResource<1>;
103def Zn3ALU3 : ProcResource<1>;
104
105// AMD SOG 19h, 2.10.2 Execution Units
106// There is also a separate branch execution unit.
107def Zn3BRU1 : ProcResource<1>;
108
109// AMD SOG 19h, 2.10.2 Execution Units
110// There are three Address Generation Units (AGUs) for all load and store
111// address generation. There are also 3 store data movement units
112// associated with the same schedulers as the AGUs.
113def Zn3AGU0 : ProcResource<1>;
114def Zn3AGU1 : ProcResource<1>;
115def Zn3AGU2 : ProcResource<1>;
116
117//
118// Execution Units
119//===----------------------------------------------------------------------===//
120
121// AMD SOG 19h, 2.10.2 Execution Units
122// ALU0 additionally has divide <...> execution capability.
123defvar Zn3Divider = Zn3ALU0;
124
125// AMD SOG 19h, 2.10.2 Execution Units
126// ALU0 additionally has <...> branch execution capability.
127defvar Zn3BRU0 = Zn3ALU0;
128
129// Integer Multiplication issued on ALU1.
130defvar Zn3Multiplier = Zn3ALU1;
131
132// Execution pipeline grouping
133//===----------------------------------------------------------------------===//
134
135// General ALU operations
136def Zn3ALU0123 : ProcResGroup<[Zn3ALU0, Zn3ALU1, Zn3ALU2, Zn3ALU3]>;
137
138// General AGU operations
139def Zn3AGU012 : ProcResGroup<[Zn3AGU0, Zn3AGU1, Zn3AGU2]>;
140
141// Control flow: jumps, calls
142def Zn3BRU01 : ProcResGroup<[Zn3BRU0, Zn3BRU1]>;
143
144// Everything that isn't control flow, but still needs to access CC register,
145// namely: conditional moves, SETcc.
146def Zn3ALU03 : ProcResGroup<[Zn3ALU0, Zn3ALU3]>;
147
148// Zn3ALU1 handles complex bit twiddling: CRC/PDEP/PEXT
149
150// Simple bit twiddling: bit test, shift/rotate, bit extraction
151def Zn3ALU12 : ProcResGroup<[Zn3ALU1, Zn3ALU2]>;
152
153
154//
155// Scheduling
156//===----------------------------------------------------------------------===//
157
158// AMD SOG 19h, 2.10.3 Retire Control Unit
159// The integer physical register file (PRF) consists of 192 registers.
// Integer physical register file: 192 entries shared by GR64 and CCR, each
// with a cost of 1. The fourth argument ([1, 0]) is the per-class
// move-elimination flag list: set for GR64, clear for CCR.
160def Zn3IntegerPRF : RegisterFile<192, [GR64, CCR], [1, 1], [1, 0],
161                              6,  // Max moves that can be eliminated per cycle.
162                              0>; // 0: move elimination is NOT restricted to zero regs.
163
164// anandtech, The integer scheduler has a 4*24 entry macro op capacity.
165// AMD SOG 19h, 2.10.1 Schedulers
166// The schedulers can receive up to six macro ops per cycle, with a limit of
167// two per scheduler. Each scheduler can issue one micro op per cycle into
168// each of its associated pipelines
169// FIXME: these are 4 separate schedulers, not a single big one.
170def Zn3Int : ProcResGroup<[Zn3ALU0, Zn3AGU0, Zn3BRU0, // scheduler 0
171                           Zn3ALU1, Zn3AGU1,          // scheduler 1
172                           Zn3ALU2, Zn3AGU2,          // scheduler 2
173                           Zn3ALU3,          Zn3BRU1  // scheduler 3
174                          ]> {
175  let BufferSize = !mul(4, 24);
176}
177
178
179//===----------------------------------------------------------------------===//
180// Floating-Point Unit
181//
182
183// AMD SOG 19h, 2.4 Superscalar Organization
184// The processor uses <...> two decoupled independent floating point schedulers
185// each servicing two FP pipelines and one store or FP-to-integer pipeline.
186
187//
188// Execution pipes
189//===----------------------------------------------------------------------===//
190
191// AMD SOG 19h, 2.10.1 Schedulers
192// <...>, and six FPU pipes.
193// Agner, 22.10 Floating point execution pipes
194// There are six floating point/vector execution pipes,
// Pipes 0-3 are declared as individual single-unit resources.
195def Zn3FPP0  : ProcResource<1>;
196def Zn3FPP1  : ProcResource<1>;
197def Zn3FPP2  : ProcResource<1>;
198def Zn3FPP3  : ProcResource<1>;
// Pipes 4 and 5 are declared as one resource with two units rather than as
// separate Zn3FPP4 / Zn3FPP5 records.
199def Zn3FPP45 : ProcResource<2>;
200
201//
202// Execution Units
203//===----------------------------------------------------------------------===//
204// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
205
206// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
207defvar Zn3FPFMul0 = Zn3FPP0;
208defvar Zn3FPFMul1 = Zn3FPP1;
209
210// (v)FADD*
211defvar Zn3FPFAdd0 = Zn3FPP2;
212defvar Zn3FPFAdd1 = Zn3FPP3;
213
214// All convert operations except pack/unpack
215defvar Zn3FPFCvt0 = Zn3FPP2;
216defvar Zn3FPFCvt1 = Zn3FPP3;
217
218// All Divide and Square Root except Reciprocal Approximation
219// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
220// FDIV unit can support 2 simultaneous operations in flight
221// even though it occupies a single pipe.
222// FIXME: BufferSize=2 ?
223defvar Zn3FPFDiv = Zn3FPP1;
224
225// Moves and Logical operations on Floating Point Data Types
226defvar Zn3FPFMisc0 = Zn3FPP0;
227defvar Zn3FPFMisc1 = Zn3FPP1;
228defvar Zn3FPFMisc2 = Zn3FPP2;
229defvar Zn3FPFMisc3 = Zn3FPP3;
230
231// Integer Adds, Subtracts, and Compares
232// Some complex VADD operations are not available in all pipes.
233defvar Zn3FPVAdd0 = Zn3FPP0;
234defvar Zn3FPVAdd1 = Zn3FPP1;
235defvar Zn3FPVAdd2 = Zn3FPP2;
236defvar Zn3FPVAdd3 = Zn3FPP3;
237
238// Integer Multiplies, SAD, Blendvb
239defvar Zn3FPVMul0 = Zn3FPP0;
240defvar Zn3FPVMul1 = Zn3FPP3;
241
242// Data Shuffles, Packs, Unpacks, Permute
243// Some complex shuffle operations are only available in pipe1.
244defvar Zn3FPVShuf = Zn3FPP1;
245defvar Zn3FPVShufAux = Zn3FPP2;
246
247// Bit Shift Left/Right operations
248defvar Zn3FPVShift0 = Zn3FPP1;
249defvar Zn3FPVShift1 = Zn3FPP2;
250
251// Moves and Logical operations on Packed Integer Data Types
252defvar Zn3FPVMisc0 = Zn3FPP0;
253defvar Zn3FPVMisc1 = Zn3FPP1;
254defvar Zn3FPVMisc2 = Zn3FPP2;
255defvar Zn3FPVMisc3 = Zn3FPP3;
256
257// *AES*
258defvar Zn3FPAES0 = Zn3FPP0;
259defvar Zn3FPAES1 = Zn3FPP1;
260
261// *CLM*
262defvar Zn3FPCLM0 = Zn3FPP0;
263defvar Zn3FPCLM1 = Zn3FPP1;
264
265// Execution pipeline grouping
266//===----------------------------------------------------------------------===//
267
268// AMD SOG 19h, 2.11 Floating-Point Unit
269// Stores and floating point to general purpose register transfer
270// have 2 dedicated pipelines (pipe 5 and 6).
271def Zn3FPU0123 : ProcResGroup<[Zn3FPP0, Zn3FPP1, Zn3FPP2, Zn3FPP3]>;
272
273// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
274def Zn3FPFMul01 : ProcResGroup<[Zn3FPFMul0, Zn3FPFMul1]>;
275
276// (v)FADD*
277// Some complex VADD operations are not available in all pipes.
278def Zn3FPFAdd01 : ProcResGroup<[Zn3FPFAdd0, Zn3FPFAdd1]>;
279
280// All convert operations except pack/unpack
281def Zn3FPFCvt01 : ProcResGroup<[Zn3FPFCvt0, Zn3FPFCvt1]>;
282
283// All Divide and Square Root except Reciprocal Approximation
284// def Zn3FPFDiv : ProcResGroup<[Zn3FPFDiv]>;
285
286// Moves and Logical operations on Floating Point Data Types
287def Zn3FPFMisc0123 : ProcResGroup<[Zn3FPFMisc0, Zn3FPFMisc1, Zn3FPFMisc2, Zn3FPFMisc3]>;
288
289def Zn3FPFMisc12 : ProcResGroup<[Zn3FPFMisc1, Zn3FPFMisc2]>;
290
291// Loads, Stores and Move to General Register (EX) Operations
292// AMD SOG 19h, 2.11 Floating-Point Unit
293// Stores and floating point to general purpose register transfer
294// have 2 dedicated pipelines (pipe 5 and 6).
295defvar Zn3FPLd01 = Zn3FPP45;
296
297// AMD SOG 19h, 2.11 Floating-Point Unit
298// Note that FP stores are supported on two pipelines,
299// but throughput is limited to one per cycle.
// Single-unit resource declared as a sub-resource of the two-unit Zn3FPP45
// (via `Super`), reflecting the one-store-per-cycle limit quoted above.
300let Super = Zn3FPP45 in
301def Zn3FPSt : ProcResource<1>;
302
303// Integer Adds, Subtracts, and Compares
304// Some complex VADD operations are not available in all pipes.
305def Zn3FPVAdd0123 : ProcResGroup<[Zn3FPVAdd0, Zn3FPVAdd1, Zn3FPVAdd2, Zn3FPVAdd3]>;
306
307def Zn3FPVAdd01: ProcResGroup<[Zn3FPVAdd0, Zn3FPVAdd1]>;
308def Zn3FPVAdd12: ProcResGroup<[Zn3FPVAdd1, Zn3FPVAdd2]>;
309
310// Integer Multiplies, SAD, Blendvb
311def Zn3FPVMul01 : ProcResGroup<[Zn3FPVMul0, Zn3FPVMul1]>;
312
313// Data Shuffles, Packs, Unpacks, Permute
314// Some complex shuffle operations are only available in pipe1.
315def Zn3FPVShuf01 : ProcResGroup<[Zn3FPVShuf, Zn3FPVShufAux]>;
316
317// Bit Shift Left/Right operations
318def Zn3FPVShift01 : ProcResGroup<[Zn3FPVShift0, Zn3FPVShift1]>;
319
320// Moves and Logical operations on Packed Integer Data Types
321def Zn3FPVMisc0123 : ProcResGroup<[Zn3FPVMisc0, Zn3FPVMisc1, Zn3FPVMisc2, Zn3FPVMisc3]>;
322
323// *AES*
324def Zn3FPAES01 : ProcResGroup<[Zn3FPAES0, Zn3FPAES1]>;
325
326// *CLM*
327def Zn3FPCLM01 : ProcResGroup<[Zn3FPCLM0, Zn3FPCLM1]>;
328
329
330//
331// Scheduling
332//===----------------------------------------------------------------------===//
333
334// Agner, 21.8 Register renaming and out-of-order schedulers
335// The floating point register file has 160 vector registers
336// of 128 bits each in Zen 1 and 256 bits each in Zen 2.
337// anandtech also confirms this.
338def Zn3FpPRF : RegisterFile<160, [VR64, VR128, VR256], [1, 1, 1], [0, 1, 1],
339                            6,  // Max moves that can be eliminated per cycle.
340                            0>; // Restrict move elimination to zero regs.
341
342// AMD SOG 19h, 2.11 Floating-Point Unit
343// The floating-point scheduler has a 2*32 entry macro op capacity.
344// AMD SOG 19h, 2.11 Floating-Point Unit
345// <...> the scheduler can issue 1 micro op per cycle for each pipe.
346// FIXME: those are two separate schedulers, not a single big one.
347def Zn3FP : ProcResGroup<[Zn3FPP0, Zn3FPP2,          /*Zn3FPP4,*/ // scheduler 0
348                          Zn3FPP1, Zn3FPP3, Zn3FPP45 /*Zn3FPP5*/  // scheduler 1
349                         ]> {
350  let BufferSize = !mul(2, 32);
351}
352
353// AMD SOG 19h, 2.11 Floating-Point Unit
354// Macro ops can be dispatched to the 64 entry Non Scheduling Queue (NSQ)
355// even if floating-point scheduler is full.
356// FIXME: how to model this properly?
357
358
359//===----------------------------------------------------------------------===//
360// Load-Store Unit
361//
362
363// AMD SOG 19h, 2.12 Load-Store Unit
364// The LS unit contains three largely independent pipe-lines
365// enabling the execution of three 256-bit memory operations per cycle.
366def Zn3LSU : ProcResource<3>;
367
368// AMD SOG 19h, 2.12 Load-Store Unit
369// All three memory operations can be loads.
// Load resource: 3 units, declared as a sub-resource of Zn3LSU via `Super`.
370let Super = Zn3LSU in
371def Zn3Load : ProcResource<3> {
372  // AMD SOG 19h, 2.12 Load-Store Unit
373  // The LS unit can process up to 72 out-of-order loads.
374  let BufferSize = 72;
375}
376
377def Zn3LoadQueue : LoadQueue<Zn3Load>;
378
379// AMD SOG 19h, 2.12 Load-Store Unit
380// A maximum of two of the memory operations can be stores.
// Store resource: 2 units, declared as a sub-resource of Zn3LSU via `Super`.
381let Super = Zn3LSU in
382def Zn3Store : ProcResource<2> {
383  // AMD SOG 19h, 2.12 Load-Store Unit
384  // The LS unit utilizes a 64-entry store queue (STQ).
385  let BufferSize = 64;
386}
387
388def Zn3StoreQueue : StoreQueue<Zn3Store>;
389
390//===----------------------------------------------------------------------===//
391// Basic helper classes.
392//===----------------------------------------------------------------------===//
393
394// Many SchedWrites are defined in pairs with and without a folded load.
395// Instructions with folded loads are usually micro-fused, so they only appear
396// as two micro-ops when dispatched by the schedulers.
397// This multiclass defines the resource usage for variants with and without
398// folded loads.
399
// Lowest-level helper: emits one anonymous WriteRes for SchedRW.
//   ExePorts - resources consumed by the write.
//   Lat      - value assigned to Latency (default 1).
//   Res      - value assigned to ResourceCycles (default: empty list).
//   UOps     - value assigned to NumMicroOps (default 1).
400multiclass __zn3WriteRes<SchedWrite SchedRW, list<ProcResourceKind> ExePorts,
401                         int Lat = 1, list<int> Res = [], int UOps = 1> {
402  def : WriteRes<SchedRW, ExePorts> {
403    let Latency = Lat;
404    let ResourceCycles = Res;
405    let NumMicroOps = UOps;
406  }
407}
408
// Emits two WriteRes records for a foldable write: one for the register form
// (SchedRW) and one for the folded-load form (SchedRW.Folded).
//   ExePorts/Lat/Res/UOps - passed unchanged to the register form.
//   LoadLat   - added to Lat for the folded form.
//   LoadUOps  - added to UOps for the folded form.
//   AGU       - resource placed first in the folded form's port list,
//               followed by Zn3Load and then ExePorts.
//   LoadRes   - cycles charged to Zn3Load in the folded form.
409multiclass __zn3WriteResPair<X86FoldableSchedWrite SchedRW,
410                             list<ProcResourceKind> ExePorts, int Lat,
411                             list<int> Res, int UOps, int LoadLat, int LoadUOps,
412                             ProcResourceKind AGU, int LoadRes> {
  // Register (non-folded) form.
413  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
414
  // Folded-load form. ResourceCycles is chosen as follows:
  //  * Res empty and LoadRes == 1: pass an empty list (no explicit cycles).
  //  * otherwise: [1, LoadRes] for AGU and Zn3Load, followed by Res, or by
  //    one `1` per entry of ExePorts when Res is empty.
415  defm : __zn3WriteRes<SchedRW.Folded,
416                       !listconcat([AGU, Zn3Load], ExePorts),
417                       !add(Lat, LoadLat),
418                       !if(!and(!empty(Res), !eq(LoadRes, 1)),
419                         [],
420                         !listconcat([1, LoadRes],
421                           !if(!empty(Res),
422                             !listsplat(1, !size(ExePorts)),
423                             Res))),
424                       !add(UOps, LoadUOps)>;
425}
426
427// For classes without folded loads.
// Integer-domain write without a folded-load variant; forwards all arguments
// unchanged to __zn3WriteRes.
428multiclass Zn3WriteResInt<SchedWrite SchedRW,
429                          list<ProcResourceKind> ExePorts, int Lat = 1,
430                          list<int> Res = [], int UOps = 1> {
431  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
432}
433
// XMM write without a folded-load variant. The body is identical to
// Zn3WriteResInt; only the name differs.
434multiclass Zn3WriteResXMM<SchedWrite SchedRW,
435                          list<ProcResourceKind> ExePorts, int Lat = 1,
436                          list<int> Res = [], int UOps = 1> {
437  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
438}
439
// YMM write without a folded-load variant. The body is identical to
// Zn3WriteResInt; only the name differs.
440multiclass Zn3WriteResYMM<SchedWrite SchedRW,
441                          list<ProcResourceKind> ExePorts, int Lat = 1,
442                          list<int> Res = [], int UOps = 1> {
443  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
444}
445
446// For classes with folded loads.
// Integer-domain foldable write. The folded form adds
// Znver3Model.LoadLatency to Lat and uses Zn3AGU012 as the address resource.
//   LoadUOps - extra micro-ops for the folded form (default 0).
//   LoadRes  - cycles charged to Zn3Load in the folded form (default 1).
447multiclass Zn3WriteResIntPair<X86FoldableSchedWrite SchedRW,
448                              list<ProcResourceKind> ExePorts, int Lat = 1,
449                              list<int> Res = [], int UOps = 1,
450                              int LoadUOps = 0, int LoadRes = 1> {
451  defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
452                           Znver3Model.LoadLatency,
453                           LoadUOps, Zn3AGU012, LoadRes>;
454}
455
// XMM foldable write. The folded form adds Znver3Model.VecLoadLatency to Lat
// and passes Zn3FPLd01 in the slot where the integer variant passes
// Zn3AGU012.
//   LoadUOps - extra micro-ops for the folded form (default 0).
//   LoadRes  - cycles charged to Zn3Load in the folded form (default 1).
456multiclass Zn3WriteResXMMPair<X86FoldableSchedWrite SchedRW,
457                              list<ProcResourceKind> ExePorts, int Lat = 1,
458                              list<int> Res = [], int UOps = 1,
459                              int LoadUOps = 0, int LoadRes = 1> {
460  defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
461                           Znver3Model.VecLoadLatency,
462                           LoadUOps, Zn3FPLd01, LoadRes>;
463}
464
// YMM foldable write. The body is identical to Zn3WriteResXMMPair: the folded
// form adds Znver3Model.VecLoadLatency and uses Zn3FPLd01.
//   LoadUOps - extra micro-ops for the folded form (default 0).
//   LoadRes  - cycles charged to Zn3Load in the folded form (default 1).
465multiclass Zn3WriteResYMMPair<X86FoldableSchedWrite SchedRW,
466                              list<ProcResourceKind> ExePorts, int Lat = 1,
467                              list<int> Res = [], int UOps = 1,
468                              int LoadUOps = 0, int LoadRes = 1> {
469  defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
470                           Znver3Model.VecLoadLatency,
471                           LoadUOps, Zn3FPLd01, LoadRes>;
472}
473
474
475//===----------------------------------------------------------------------===//
476// Here be dragons.
477//===----------------------------------------------------------------------===//
478
479def : ReadAdvance<ReadAfterLd, Znver3Model.LoadLatency>;
480
481def : ReadAdvance<ReadAfterVecLd, Znver3Model.VecLoadLatency>;
482def : ReadAdvance<ReadAfterVecXLd, Znver3Model.VecLoadLatency>;
483def : ReadAdvance<ReadAfterVecYLd, Znver3Model.VecLoadLatency>;
484
485// AMD SOG 19h, 2.11 Floating-Point Unit
486// There is 1 cycle of added latency for a result to cross
487// from F to I or I to F domain.
488def : ReadAdvance<ReadInt2Fpu, -1>;
489
490// Instructions with both a load and a store folded are modeled as a folded
491// load + WriteRMW.
492defm : Zn3WriteResInt<WriteRMW, [Zn3AGU012, Zn3Store], Znver3Model.StoreLatency, [1, 1], 0>;
493
494// Loads, stores, and moves, not folded with other operations.
495defm : Zn3WriteResInt<WriteLoad, [Zn3AGU012, Zn3Load], !add(Znver3Model.LoadLatency, 1), [1, 1], 1>;
496
// Override for the loads listed in the InstRW that follows. It has the same
// latency and micro-op count as WriteLoad, but charges the AGU group 3 cycles
// where WriteLoad charges 1.
497def Zn3WriteMOVSlow : SchedWriteRes<[Zn3AGU012, Zn3Load]> {
498  let Latency = !add(Znver3Model.LoadLatency, 1);
499  let ResourceCycles = [3, 1];
500  let NumMicroOps = 1;
501}
502def : InstRW<[Zn3WriteMOVSlow], (instrs MOV8rm, MOV8rm_NOREX, MOV16rm, MOVSX16rm16, MOVSX16rm32, MOVZX16rm16, MOVSX16rm8, MOVZX16rm8)>;
503
504defm : Zn3WriteResInt<WriteStore, [Zn3AGU012, Zn3Store], Znver3Model.StoreLatency, [1, 2], 1>;
505defm : Zn3WriteResInt<WriteStoreNT, [Zn3AGU012, Zn3Store], Znver3Model.StoreLatency, [1, 2], 1>;
506defm : Zn3WriteResInt<WriteMove, [Zn3ALU0123], 1, [4], 1>;
507
508// Treat misc copies as a move.
509def : InstRW<[WriteMove], (instrs COPY)>;
510
511def Zn3WriteMOVBE16rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
512  let Latency = Znver3Model.LoadLatency;
513  let ResourceCycles = [1, 1, 4];
514  let NumMicroOps = 1;
515}
516def : InstRW<[Zn3WriteMOVBE16rm], (instrs MOVBE16rm)>;
517
518def Zn3WriteMOVBEmr : SchedWriteRes<[Zn3ALU0123, Zn3AGU012, Zn3Store]> {
519  let Latency = Znver3Model.StoreLatency;
520  let ResourceCycles = [4, 1, 1];
521  let NumMicroOps = 2;
522}
523def : InstRW<[Zn3WriteMOVBEmr], (instrs MOVBE16mr, MOVBE32mr, MOVBE64mr)>;
524
525// Arithmetic.
526defm : Zn3WriteResIntPair<WriteALU, [Zn3ALU0123], 1, [1], 1>; // Simple integer ALU op.
527
528def Zn3WriteALUSlow : SchedWriteRes<[Zn3ALU0123]> {
529  let Latency = 1;
530  let ResourceCycles = [4];
531  let NumMicroOps = 1;
532}
533def : InstRW<[Zn3WriteALUSlow], (instrs ADD8i8, ADD16i16, ADD32i32, ADD64i32,
534                                        AND8i8, AND16i16, AND32i32, AND64i32,
535                                         OR8i8,  OR16i16,  OR32i32,  OR64i32,
536                                        SUB8i8, SUB16i16, SUB32i32, SUB64i32,
537                                        XOR8i8, XOR16i16, XOR32i32, XOR64i32)>;
538
539def Zn3WriteMoveExtend : SchedWriteRes<[Zn3ALU0123]> {
540  let Latency = 1;
541  let ResourceCycles = [4];
542  let NumMicroOps = 1;
543}
544def : InstRW<[Zn3WriteMoveExtend], (instrs MOVSX16rr16, MOVSX16rr32, MOVZX16rr16, MOVSX16rr8, MOVZX16rr8)>;
545
546def Zn3WriteMaterialize32bitImm: SchedWriteRes<[Zn3ALU0123]> {
547  let Latency = 1;
548  let ResourceCycles = [2];
549  let NumMicroOps = 1;
550}
551def : InstRW<[Zn3WriteMaterialize32bitImm], (instrs MOV32ri, MOV32ri_alt, MOV64ri32)>;
552
553def Zn3WritePDEP_PEXT : SchedWriteRes<[Zn3ALU1]> {
554  let Latency = 3;
555  let ResourceCycles = [1];
556  let NumMicroOps = 1;
557}
558def : InstRW<[Zn3WritePDEP_PEXT], (instrs PDEP32rr, PDEP64rr,
559                                          PEXT32rr, PEXT64rr)>;
560
561defm : Zn3WriteResIntPair<WriteADC, [Zn3ALU0123], 1, [4], 1>; // Integer ALU + flags op.
562
563def Zn3WriteADC8mr_SBB8mr : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123, Zn3Store]> {
564  let Latency = 1;
565  let ResourceCycles = [1, 1, 7, 1];
566  let NumMicroOps = 1;
567}
568def : InstRW<[Zn3WriteADC8mr_SBB8mr], (instrs ADC8mr, SBB8mr)>;
569
570// This is for simple LEAs with one or two input operands.
571defm : Zn3WriteResInt<WriteLEA, [Zn3AGU012], 1, [1], 1>;     // LEA instructions can't fold loads.
572
573// This write is used for slow LEA instructions.
// Slow-LEA write: runs on the ALU group (WriteLEA uses the AGU group), with
// latency 2 and 2 micro-ops where WriteLEA has latency 1 and 1 micro-op.
574def Zn3Write3OpsLEA : SchedWriteRes<[Zn3ALU0123]> {
575  let Latency = 2;
576  let ResourceCycles = [1];
577  let NumMicroOps = 2;
578}
579
580// A slow LEA is either a 3Ops LEA (base, index, offset),
581// or an LEA with a `Scale` value different than 1.
// NOTE(review): this comment originally began "On Piledriver", although the
// predicate is used by the Znver3 model — presumably inherited from another
// model; confirm that the same definition of "slow" applies to Znver3.
582def Zn3SlowLEAPredicate : MCSchedPredicate<
583  CheckAny<[
584    // A 3-operand LEA (base, index, offset).
585    IsThreeOperandsLEAFn,
586    // An LEA with a "Scale" different than 1.
    // Operand 2 must be an immediate and must not equal 1.
587    CheckAll<[
588      CheckIsImmOperand<2>,
589      CheckNot<CheckImmOperand<2, 1>>
590    ]>
591  ]>
592>;
593
// Variant write for LEA: uses Zn3Write3OpsLEA when Zn3SlowLEAPredicate
// matches, and the plain WriteLEA otherwise (NoSchedPred fallback).
594def Zn3WriteLEA : SchedWriteVariant<[
595    SchedVar<Zn3SlowLEAPredicate, [Zn3Write3OpsLEA]>,
596    SchedVar<NoSchedPred,         [WriteLEA]>
597]>;
598
599def : InstRW<[Zn3WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
600
601def Zn3SlowLEA16r : SchedWriteRes<[Zn3ALU0123]> {
602  let Latency = 2; // FIXME: not from llvm-exegesis
603  let ResourceCycles = [4];
604  let NumMicroOps = 2;
605}
606
607def : InstRW<[Zn3SlowLEA16r], (instrs LEA16r)>;
608
609// Integer multiplication
610defm : Zn3WriteResIntPair<WriteIMul8, [Zn3Multiplier], 3, [3], 1>; // Integer 8-bit multiplication.
611defm : Zn3WriteResIntPair<WriteIMul16, [Zn3Multiplier], 3, [3], 3, /*LoadUOps=*/1>; // Integer 16-bit multiplication.
612defm : Zn3WriteResIntPair<WriteIMul16Imm, [Zn3Multiplier], 4, [4], 2>; // Integer 16-bit multiplication by immediate.
613defm : Zn3WriteResIntPair<WriteIMul16Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 16-bit multiplication by register.
614defm : Zn3WriteResIntPair<WriteIMul32, [Zn3Multiplier], 3, [3], 2>;    // Integer 32-bit multiplication.
615
616def Zn3MULX32rr : SchedWriteRes<[Zn3Multiplier]> {
617  let Latency = 4;
618  let ResourceCycles = [1];
619  let NumMicroOps = 2;
620}
621def : InstRW<[Zn3MULX32rr, WriteIMulH], (instrs MULX32rr)>;
622
// Memory form of MULX32: latency is the register form's latency plus
// LoadLatency, and the micro-op count is copied from the register form.
// The multiplier is charged 2 cycles here, versus 1 in Zn3MULX32rr.
623def Zn3MULX32rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3Multiplier]> {
624  let Latency = !add(Znver3Model.LoadLatency, Zn3MULX32rr.Latency);
625  let ResourceCycles = [1, 1, 2];
626  let NumMicroOps = Zn3MULX32rr.NumMicroOps;
627}
628def : InstRW<[Zn3MULX32rm, WriteIMulH], (instrs MULX32rm)>;
629
630defm : Zn3WriteResIntPair<WriteIMul32Imm, [Zn3Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by immediate.
631defm : Zn3WriteResIntPair<WriteIMul32Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by register.
632defm : Zn3WriteResIntPair<WriteIMul64, [Zn3Multiplier], 3, [3], 2>;    // Integer 64-bit multiplication.
633
634def Zn3MULX64rr : SchedWriteRes<[Zn3Multiplier]> {
635  let Latency = 4;
636  let ResourceCycles = [1];
637  let NumMicroOps = 2;
638}
639def : InstRW<[Zn3MULX64rr, WriteIMulH], (instrs MULX64rr)>;
640
641def Zn3MULX64rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3Multiplier]> {
642  let Latency = !add(Znver3Model.LoadLatency, Zn3MULX64rr.Latency);
643  let ResourceCycles = [1, 1, 2];
644  let NumMicroOps = Zn3MULX64rr.NumMicroOps;
645}
646def : InstRW<[Zn3MULX64rm, WriteIMulH], (instrs MULX64rm)>;
647
648defm : Zn3WriteResIntPair<WriteIMul64Imm, [Zn3Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by immediate.
649defm : Zn3WriteResIntPair<WriteIMul64Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by register.
// Consumes no resources and contributes 0 micro-ops; it only carries a
// latency of 4. It is listed as the second write in the MULX InstRW entries.
650defm : Zn3WriteResInt<WriteIMulH, [], 4, [], 0>;         // Integer multiplication, high part.
651
652defm : Zn3WriteResInt<WriteBSWAP32, [Zn3ALU0123], 1, [1], 1>; // Byte Order (Endianness) 32-bit Swap.
653defm : Zn3WriteResInt<WriteBSWAP64, [Zn3ALU0123], 1, [1], 1>; // Byte Order (Endianness) 64-bit Swap.
654
655defm : Zn3WriteResIntPair<WriteCMPXCHG, [Zn3ALU0123], 3, [12], 5>; // Compare and set, compare and swap.
656
657def Zn3WriteCMPXCHG8rr : SchedWriteRes<[Zn3ALU0123]> {
658  let Latency = 3;
659  let ResourceCycles = [12];
660  let NumMicroOps = 3;
661}
662def : InstRW<[Zn3WriteCMPXCHG8rr], (instrs CMPXCHG8rr)>;
663
664defm : Zn3WriteResInt<WriteCMPXCHGRMW, [Zn3ALU0123], 3, [12], 6>;     // Compare and set, compare and swap.
665
// Memory form of the 8-bit CMPXCHG: latency is LoadLatency plus the register
// form's latency, and the micro-op count is the register form's plus 2.
// The ALU group is charged 12 cycles, the same as Zn3WriteCMPXCHG8rr.
666def Zn3WriteCMPXCHG8rm_LCMPXCHG8 : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
667  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteCMPXCHG8rr.Latency);
668  let ResourceCycles = [1, 1, 12];
669  let NumMicroOps = !add(Zn3WriteCMPXCHG8rr.NumMicroOps, 2);
670}
671def : InstRW<[Zn3WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>;
672
673def Zn3WriteCMPXCHG8B : SchedWriteRes<[Zn3ALU0123]> {
674  let Latency = 3; // FIXME: not from llvm-exegesis
675  let ResourceCycles = [24];
676  let NumMicroOps = 19;
677}
678def : InstRW<[Zn3WriteCMPXCHG8B], (instrs CMPXCHG8B)>;
679
680def Zn3WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn3ALU0123]> {
681  let Latency = 4; // FIXME: not from llvm-exegesis
682  let ResourceCycles = [59];
683  let NumMicroOps = 28;
684}
685def : InstRW<[Zn3WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B, LCMPXCHG16B)>;
686
687def Zn3WriteWriteXCHGUnrenameable : SchedWriteRes<[Zn3ALU0123]> {
688  let Latency = 1;
689  let ResourceCycles = [2];
690  let NumMicroOps = 2;
691}
692def : InstRW<[Zn3WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr, XCHG16ar)>;
693
694def Zn3WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
695  let Latency = !add(Znver3Model.LoadLatency, 3); // FIXME: not from llvm-exegesis
696  let ResourceCycles = [1, 1, 2];
697  let NumMicroOps = 5;
698}
699def : InstRW<[Zn3WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>;
700
701def Zn3WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
702  let Latency = !add(Znver3Model.LoadLatency, 2); // FIXME: not from llvm-exegesis
703  let ResourceCycles = [1, 1, 2];
704  let NumMicroOps = 2;
705}
706def : InstRW<[Zn3WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>;
707
708// Integer division.
709// FIXME: uops for 8-bit division measures as 2. for others it's a guess.
710// FIXME: latency for 8-bit division measures as 10. for others it's a guess.
711defm : Zn3WriteResIntPair<WriteDiv8, [Zn3Divider], 10, [10], 2>;
712defm : Zn3WriteResIntPair<WriteDiv16, [Zn3Divider], 11, [11], 2>;
713defm : Zn3WriteResIntPair<WriteDiv32, [Zn3Divider], 13, [13], 2>;
714defm : Zn3WriteResIntPair<WriteDiv64, [Zn3Divider], 17, [17], 2>;
715defm : Zn3WriteResIntPair<WriteIDiv8, [Zn3Divider], 10, [10], 2>;
716defm : Zn3WriteResIntPair<WriteIDiv16, [Zn3Divider], 11, [11], 2>;
717defm : Zn3WriteResIntPair<WriteIDiv32, [Zn3Divider], 13, [13], 2>;
718defm : Zn3WriteResIntPair<WriteIDiv64, [Zn3Divider], 17, [17], 2>;
719
720defm : Zn3WriteResIntPair<WriteBSF, [Zn3ALU1], 3, [3], 6, /*LoadUOps=*/2>; // Bit scan forward.
721defm : Zn3WriteResIntPair<WriteBSR, [Zn3ALU1], 4, [4], 6, /*LoadUOps=*/2>; // Bit scan reverse.
722
723defm : Zn3WriteResIntPair<WritePOPCNT, [Zn3ALU0123], 1, [1], 1>; // Bit population count.
724
725def Zn3WritePOPCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
726  let Latency = 1;
727  let ResourceCycles = [4];
728  let NumMicroOps = 1;
729}
730def : InstRW<[Zn3WritePOPCNT16rr], (instrs POPCNT16rr)>;
731
732defm : Zn3WriteResIntPair<WriteLZCNT, [Zn3ALU0123], 1, [1], 1>; // Leading zero count.
733
734def Zn3WriteLZCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
735  let Latency = 1;
736  let ResourceCycles = [4];
737  let NumMicroOps = 1;
738}
739def : InstRW<[Zn3WriteLZCNT16rr], (instrs LZCNT16rr)>;
740
741defm : Zn3WriteResIntPair<WriteTZCNT, [Zn3ALU12], 2, [1], 2>; // Trailing zero count.
742
743def Zn3WriteTZCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
744  let Latency = 2;
745  let ResourceCycles = [4];
746  let NumMicroOps = 2;
747}
748def : InstRW<[Zn3WriteTZCNT16rr], (instrs TZCNT16rr)>;
749
750defm : Zn3WriteResIntPair<WriteCMOV, [Zn3ALU03], 1, [1], 1>; // Conditional move.
751defm : Zn3WriteResInt<WriteFCMOV, [Zn3ALU0123], 7, [28], 7>; // FIXME: not from llvm-exegesis // X87 conditional move.
752defm : Zn3WriteResInt<WriteSETCC, [Zn3ALU03], 1, [2], 1>; // Set register based on condition code.
753defm : Zn3WriteResInt<WriteSETCCStore, [Zn3ALU03, Zn3AGU012, Zn3Store], 2, [2, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
754defm : Zn3WriteResInt<WriteLAHFSAHF, [Zn3ALU3], 1, [1], 1>; // Load/Store flags in AH.
755
// Bit Test
defm : Zn3WriteResInt<WriteBitTest, [Zn3ALU12], 1, [1], 1>;
defm : Zn3WriteResInt<WriteBitTestImmLd, [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 1), [1, 1, 1], 2>;
defm : Zn3WriteResInt<WriteBitTestRegLd, [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 1), [1, 1, 1], 7>;

// Bit Test + Set
defm : Zn3WriteResInt<WriteBitTestSet, [Zn3ALU12], 2, [2], 2>;
defm : Zn3WriteResInt<WriteBitTestSetImmLd, [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 2), [1, 1, 1], 4>;
defm : Zn3WriteResInt<WriteBitTestSetRegLd, [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 2), [1, 1, 1], 9>;
763
// Integer shifts and rotates.
// The three default classes below share identical arguments.
defm : Zn3WriteResIntPair<WriteShift, [Zn3ALU12], 1, [1], 1,
                          /*LoadUOps=*/1>;
defm : Zn3WriteResIntPair<WriteShiftCL, [Zn3ALU12], 1, [1], 1,
                          /*LoadUOps=*/1>;
defm : Zn3WriteResIntPair<WriteRotate, [Zn3ALU12], 1, [1], 1,
                          /*LoadUOps=*/1>;
768
// Override for the RCL*r1 / RCR*r1 instructions listed below.
def Zn3WriteRotateR1 : SchedWriteRes<[Zn3ALU12]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ResourceCycles = [2];
}
def : InstRW<[Zn3WriteRotateR1],
             (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
                     RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;
776
// Override for the RCL*m1 / RCR*m1 instructions listed below. Latency and
// uop count are derived from Zn3WriteRotateR1: the model's LoadLatency is
// added to its latency, and one uop is added to its uop count.
def Zn3WriteRotateM1 : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
  let NumMicroOps = !add(Zn3WriteRotateR1.NumMicroOps, 1);
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateR1.Latency);
  let ResourceCycles = [1, 1, 2];
}
def : InstRW<[Zn3WriteRotateM1],
             (instrs RCL8m1, RCL16m1, RCL32m1, RCL64m1,
                     RCR8m1, RCR16m1, RCR32m1, RCR64m1)>;
784
// Override for the RCR*ri instructions listed below.
def Zn3WriteRotateRightRI : SchedWriteRes<[Zn3ALU12]> {
  let NumMicroOps = 7;
  let Latency = 3;
  let ResourceCycles = [6];
}
def : InstRW<[Zn3WriteRotateRightRI],
             (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;
791
// Override for the RCR*mi instructions listed below. Latency is the model's
// LoadLatency plus Zn3WriteRotateRightRI's latency; the uop count is that
// record's uop count plus three.
def Zn3WriteRotateRightMI : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
  let NumMicroOps = !add(Zn3WriteRotateRightRI.NumMicroOps, 3);
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateRightRI.Latency);
  let ResourceCycles = [1, 1, 8];
}
def : InstRW<[Zn3WriteRotateRightMI],
             (instrs RCR8mi, RCR16mi, RCR32mi, RCR64mi)>;
798
// Override for the RCL*ri instructions listed below.
def Zn3WriteRotateLeftRI : SchedWriteRes<[Zn3ALU12]> {
  let NumMicroOps = 9;
  let Latency = 4;
  let ResourceCycles = [8];
}
def : InstRW<[Zn3WriteRotateLeftRI],
             (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;
805
// Override for the RCL*mi instructions listed below. Latency is the model's
// LoadLatency plus Zn3WriteRotateLeftRI's latency; the uop count is that
// record's uop count plus two.
def Zn3WriteRotateLeftMI : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
  let NumMicroOps = !add(Zn3WriteRotateLeftRI.NumMicroOps, 2);
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateLeftRI.Latency);
  let ResourceCycles = [1, 1, 8];
}
def : InstRW<[Zn3WriteRotateLeftMI],
             (instrs RCL8mi, RCL16mi, RCL32mi, RCL64mi)>;
812
// Default entry for the WriteRotateCL class.
defm : Zn3WriteResIntPair<WriteRotateCL, [Zn3ALU12], 1, [1], 1,
                          /*LoadUOps=*/1>;

// Override for the RCR*rCL instructions listed below.
def Zn3WriteRotateRightRCL : SchedWriteRes<[Zn3ALU12]> {
  let NumMicroOps = 7;
  let Latency = 3;
  let ResourceCycles = [6];
}
def : InstRW<[Zn3WriteRotateRightRCL],
             (instrs RCR8rCL, RCR16rCL, RCR32rCL, RCR64rCL)>;
821
// Override for the RCR*mCL instructions listed below. Latency is the model's
// LoadLatency plus Zn3WriteRotateRightRCL's latency; the uop count is that
// record's uop count plus two.
def Zn3WriteRotateRightMCL : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
  let NumMicroOps = !add(Zn3WriteRotateRightRCL.NumMicroOps, 2);
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateRightRCL.Latency);
  let ResourceCycles = [1, 1, 8];
}
def : InstRW<[Zn3WriteRotateRightMCL],
             (instrs RCR8mCL, RCR16mCL, RCR32mCL, RCR64mCL)>;
828
// Override for the RCL*rCL instructions listed below.
def Zn3WriteRotateLeftRCL : SchedWriteRes<[Zn3ALU12]> {
  let NumMicroOps = 9;
  let Latency = 4;
  let ResourceCycles = [8];
}
def : InstRW<[Zn3WriteRotateLeftRCL],
             (instrs RCL8rCL, RCL16rCL, RCL32rCL, RCL64rCL)>;
835
// Override for the RCL*mCL instructions listed below. Latency is the model's
// LoadLatency plus Zn3WriteRotateLeftRCL's latency; the uop count is that
// record's uop count plus two.
def Zn3WriteRotateLeftMCL : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
  let NumMicroOps = !add(Zn3WriteRotateLeftRCL.NumMicroOps, 2);
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateLeftRCL.Latency);
  let ResourceCycles = [1, 1, 8];
}
def : InstRW<[Zn3WriteRotateLeftMCL],
             (instrs RCL8mCL, RCL16mCL, RCL32mCL, RCL64mCL)>;
842
// Double shift instructions.
// Register forms first, then the memory forms (which add LoadLatency).
defm : Zn3WriteResInt<WriteSHDrri, [Zn3ALU12], 2, [3], 4>;
defm : Zn3WriteResInt<WriteSHDrrcl, [Zn3ALU12], 2, [3], 5>;
defm : Zn3WriteResInt<WriteSHDmri, [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 2), [1, 1, 4], 6>;
defm : Zn3WriteResInt<WriteSHDmrcl, [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 2), [1, 1, 4], 6>;
848
// BMI1 BEXTR/BLS, BMI2 BZHI
defm : Zn3WriteResIntPair<WriteBEXTR, [Zn3ALU12], 1, [1], 1,
                          /*LoadUOps=*/1>;
defm : Zn3WriteResIntPair<WriteBLS, [Zn3ALU0123], 2, [2], 2,
                          /*LoadUOps=*/1>;
defm : Zn3WriteResIntPair<WriteBZHI, [Zn3ALU12], 1, [1], 1,
                          /*LoadUOps=*/1>;
853
// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely, which is why both the
// latency and the resource-cycle entries below are zero.
defm : Zn3WriteResInt<WriteZero, [Zn3ALU0123],
                      0, [0], 1>;
857
// Branches don't produce values, so they have no latency, but they still
// consume resources. Indirect branches can fold loads.
// FIXME: not from llvm-exegesis
defm : Zn3WriteResIntPair<WriteJump, [Zn3BRU01],
                          1, [1], 1>;
861
// Floating point. This covers both scalar and vector operations.

// X87 constant loads: LoadLatency plus a per-class constant.
defm : Zn3WriteResInt<WriteFLD0, [Zn3FPLd01, Zn3Load, Zn3FPP1],
                      !add(Znver3Model.LoadLatency, 4), [1, 1, 1], 1>;
defm : Zn3WriteResInt<WriteFLD1, [Zn3FPLd01, Zn3Load, Zn3FPP1],
                      !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
defm : Zn3WriteResInt<WriteFLDC, [Zn3FPLd01, Zn3Load, Zn3FPP1],
                      !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;

// FP loads (plain and masked): VecLoadLatency plus one.
defm : Zn3WriteResXMM<WriteFLoad, [Zn3FPLd01, Zn3Load],
                      !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteFLoadX, [Zn3FPLd01, Zn3Load],
                      !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteFLoadY, [Zn3FPLd01, Zn3Load],
                      !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteFMaskedLoad, [Zn3FPLd01, Zn3Load],
                      !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteFMaskedLoadY, [Zn3FPLd01, Zn3Load],
                      !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;

// FP store (default class).
defm : Zn3WriteResXMM<WriteFStore, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [1, 1], 1>;
872
// Override for the MOVHPD/MOVHPS store forms (and their VEX variants)
// listed below.
def Zn3WriteWriteFStoreMMX : SchedWriteRes<[Zn3FPSt, Zn3Store]> {
  let NumMicroOps = 2;
  let Latency = 2; // FIXME: not from llvm-exegesis
  let ResourceCycles = [1, 1];
}
def : InstRW<[Zn3WriteWriteFStoreMMX],
             (instrs MOVHPDmr, MOVHPSmr,
                     VMOVHPDmr, VMOVHPSmr)>;
880
// FP stores, regular and non-temporal: all use StoreLatency, one cycle on
// each of Zn3FPSt and Zn3Store, and a single uop.
defm : Zn3WriteResXMM<WriteFStoreX, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResYMM<WriteFStoreY, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResXMM<WriteFStoreNT, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResXMM<WriteFStoreNTX, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResYMM<WriteFStoreNTY, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [1, 1], 1>;

// FP masked stores.
defm : Zn3WriteResXMM<WriteFMaskedStore32, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [6, 1], 18>;
defm : Zn3WriteResXMM<WriteFMaskedStore64, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [4, 1], 10>;
defm : Zn3WriteResYMM<WriteFMaskedStore32Y, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [12, 1], 42>;
defm : Zn3WriteResYMM<WriteFMaskedStore64Y, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [6, 1], 18>;
891
// Floating point add/sub.
defm : Zn3WriteResXMMPair<WriteFAdd, [Zn3FPFAdd01], 3, [1], 1>;

// Override for the X87 integer-memory add/sub/subr/mul forms listed below.
def Zn3WriteX87Arith : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let NumMicroOps = 2;
  // FIXME: not from llvm-exegesis
  let Latency = !add(Znver3Model.LoadLatency, 1);
  let ResourceCycles = [1, 1, 24];
}
def : InstRW<[Zn3WriteX87Arith],
             (instrs ADD_FI16m, ADD_FI32m,
                     SUB_FI16m, SUB_FI32m,
                     SUBR_FI16m, SUBR_FI32m,
                     MUL_FI16m, MUL_FI32m)>;
903
// Override for the X87 integer-memory div/divr forms listed below.
def Zn3WriteX87Div : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let NumMicroOps = 2;
  // FIXME: not from llvm-exegesis
  let Latency = !add(Znver3Model.LoadLatency, 1);
  let ResourceCycles = [1, 1, 62];
}
def : InstRW<[Zn3WriteX87Div],
             (instrs DIV_FI16m, DIV_FI32m,
                     DIVR_FI16m, DIVR_FI32m)>;
911
// Floating point add/sub (XMM).
defm : Zn3WriteResXMMPair<WriteFAddX, [Zn3FPFAdd01], 3, [1], 1>;
// Floating point add/sub (YMM).
defm : Zn3WriteResYMMPair<WriteFAddY, [Zn3FPFAdd01], 3, [1], 1>;
// Floating point add/sub (ZMM).
defm : X86WriteResPairUnsupported<WriteFAddZ>;
// Floating point double add/sub.
defm : Zn3WriteResXMMPair<WriteFAdd64, [Zn3FPFAdd01], 3, [1], 1>;
// Floating point double add/sub (XMM).
defm : Zn3WriteResXMMPair<WriteFAdd64X, [Zn3FPFAdd01], 3, [1], 1>;
// Floating point double add/sub (YMM).
defm : Zn3WriteResYMMPair<WriteFAdd64Y, [Zn3FPFAdd01], 3, [1], 1>;
// Floating point double add/sub (ZMM).
defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
// Floating point compare.
defm : Zn3WriteResXMMPair<WriteFCmp, [Zn3FPFMul01], 1, [1], 1>;
// Floating point compare (XMM).
defm : Zn3WriteResXMMPair<WriteFCmpX, [Zn3FPFMul01], 1, [1], 1>;
// Floating point compare (YMM).
defm : Zn3WriteResYMMPair<WriteFCmpY, [Zn3FPFMul01], 1, [1], 1>;
// Floating point compare (ZMM).
defm : X86WriteResPairUnsupported<WriteFCmpZ>;
// Floating point double compare.
defm : Zn3WriteResXMMPair<WriteFCmp64, [Zn3FPFMul01], 1, [1], 1>;
// Floating point double compare (XMM).
defm : Zn3WriteResXMMPair<WriteFCmp64X, [Zn3FPFMul01], 1, [1], 1>;
// Floating point double compare (YMM).
defm : Zn3WriteResYMMPair<WriteFCmp64Y, [Zn3FPFMul01], 1, [1], 1>;
// Floating point double compare (ZMM).
defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
// Floating point compare to flags (X87).
// FIXME: latency not from llvm-exegesis
defm : Zn3WriteResXMMPair<WriteFCom, [Zn3FPFMul01], 3, [2], 1>;
// Floating point compare to flags (SSE).
// FIXME: latency not from llvm-exegesis
defm : Zn3WriteResXMMPair<WriteFComX, [Zn3FPFMul01], 4, [2], 2>;
// Floating point multiplication.
defm : Zn3WriteResXMMPair<WriteFMul, [Zn3FPFMul01], 3, [1], 1>;
// Floating point multiplication (XMM).
defm : Zn3WriteResXMMPair<WriteFMulX, [Zn3FPFMul01], 3, [1], 1>;
// Floating point multiplication (YMM).
defm : Zn3WriteResYMMPair<WriteFMulY, [Zn3FPFMul01], 3, [1], 1>;
// Floating point multiplication (ZMM).
defm : X86WriteResPairUnsupported<WriteFMulZ>;
// Floating point double multiplication.
defm : Zn3WriteResXMMPair<WriteFMul64, [Zn3FPFMul01], 3, [1], 1>;
// Floating point double multiplication (XMM).
defm : Zn3WriteResXMMPair<WriteFMul64X, [Zn3FPFMul01], 3, [1], 1>;
// Floating point double multiplication (YMM).
defm : Zn3WriteResYMMPair<WriteFMul64Y, [Zn3FPFMul01], 3, [1], 1>;
// Floating point double multiplication (ZMM).
defm : X86WriteResPairUnsupported<WriteFMul64Z>;
// Floating point division.
defm : Zn3WriteResXMMPair<WriteFDiv, [Zn3FPFDiv], 11, [3], 1>;
// Floating point division (XMM).
defm : Zn3WriteResXMMPair<WriteFDivX, [Zn3FPFDiv], 11, [3], 1>;
// Floating point division (YMM).
defm : Zn3WriteResYMMPair<WriteFDivY, [Zn3FPFDiv], 11, [3], 1>;
// Floating point division (ZMM).
defm : X86WriteResPairUnsupported<WriteFDivZ>;
// Floating point double division.
defm : Zn3WriteResXMMPair<WriteFDiv64, [Zn3FPFDiv], 13, [5], 1>;
// Floating point double division (XMM).
defm : Zn3WriteResXMMPair<WriteFDiv64X, [Zn3FPFDiv], 13, [5], 1>;
// Floating point double division (YMM).
defm : Zn3WriteResYMMPair<WriteFDiv64Y, [Zn3FPFDiv], 13, [5], 1>;
// Floating point double division (ZMM).
defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
// Floating point square root.
defm : Zn3WriteResXMMPair<WriteFSqrt, [Zn3FPFDiv], 15, [5], 1>;
// Floating point square root (XMM).
defm : Zn3WriteResXMMPair<WriteFSqrtX, [Zn3FPFDiv], 15, [5], 1>;
// Floating point square root (YMM).
defm : Zn3WriteResYMMPair<WriteFSqrtY, [Zn3FPFDiv], 15, [5], 1>;
// Floating point square root (ZMM).
defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
// Floating point double square root.
defm : Zn3WriteResXMMPair<WriteFSqrt64, [Zn3FPFDiv], 21, [9], 1>;
// Floating point double square root (XMM).
defm : Zn3WriteResXMMPair<WriteFSqrt64X, [Zn3FPFDiv], 21, [9], 1>;
// Floating point double square root (YMM).
defm : Zn3WriteResYMMPair<WriteFSqrt64Y, [Zn3FPFDiv], 21, [9], 1>;
// Floating point double square root (ZMM).
defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
// Floating point long double square root.
// FIXME: latency not from llvm-exegesis
defm : Zn3WriteResXMMPair<WriteFSqrt80, [Zn3FPFDiv], 22, [23], 1>;
// Floating point reciprocal estimate.
defm : Zn3WriteResXMMPair<WriteFRcp, [Zn3FPFMul01], 3, [1], 1>;
// Floating point reciprocal estimate (XMM).
defm : Zn3WriteResXMMPair<WriteFRcpX, [Zn3FPFMul01], 3, [1], 1>;
// Floating point reciprocal estimate (YMM).
defm : Zn3WriteResYMMPair<WriteFRcpY, [Zn3FPFMul01], 3, [1], 1>;
// Floating point reciprocal estimate (ZMM).
defm : X86WriteResPairUnsupported<WriteFRcpZ>;
// Floating point reciprocal square root estimate.
defm : Zn3WriteResXMMPair<WriteFRsqrt, [Zn3FPFDiv], 3, [1], 1>;
// Floating point reciprocal square root estimate (XMM).
defm : Zn3WriteResXMMPair<WriteFRsqrtX, [Zn3FPFDiv], 3, [1], 1>;
// Floating point reciprocal square root estimate (YMM).
defm : Zn3WriteResYMMPair<WriteFRsqrtY, [Zn3FPFDiv], 3, [1], 1>;
// Floating point reciprocal square root estimate (ZMM).
defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
// Fused Multiply Add.
defm : Zn3WriteResXMMPair<WriteFMA, [Zn3FPFMul01], 4, [2], 1>;
// Fused Multiply Add (XMM).
defm : Zn3WriteResXMMPair<WriteFMAX, [Zn3FPFMul01], 4, [2], 1>;
// Fused Multiply Add (YMM).
defm : Zn3WriteResYMMPair<WriteFMAY, [Zn3FPFMul01], 4, [2], 1>;
// Fused Multiply Add (ZMM).
defm : X86WriteResPairUnsupported<WriteFMAZ>;
// Floating point double dot product.
defm : Zn3WriteResXMMPair<WriteDPPD, [Zn3FPFMul01], 9, [6], 3,
                          /*LoadUOps=*/2>;
// Floating point single dot product.
defm : Zn3WriteResXMMPair<WriteDPPS, [Zn3FPFMul01], 15, [8], 8,
                          /*LoadUOps=*/2>;
// Floating point single dot product (YMM).
defm : Zn3WriteResYMMPair<WriteDPPSY, [Zn3FPFMul01], 15, [8], 7,
                          /*LoadUOps=*/1>;
// Floating point single dot product (ZMM).
defm : X86WriteResPairUnsupported<WriteDPPSZ>;
// Floating point fabs/fchs.
// FIXME: latency not from llvm-exegesis
defm : Zn3WriteResXMMPair<WriteFSign, [Zn3FPFMul01], 1, [2], 1>;
// Floating point rounding.
defm : Zn3WriteResXMMPair<WriteFRnd, [Zn3FPFCvt01], 3, [1], 1>;
// Floating point rounding (YMM).
defm : Zn3WriteResYMMPair<WriteFRndY, [Zn3FPFCvt01], 3, [1], 1>;
// Floating point rounding (ZMM).
defm : X86WriteResPairUnsupported<WriteFRndZ>;
// Floating point and/or/xor logicals.
defm : Zn3WriteResXMMPair<WriteFLogic, [Zn3FPVMisc0123], 1, [1], 1>;
// Floating point and/or/xor logicals (YMM).
defm : Zn3WriteResYMMPair<WriteFLogicY, [Zn3FPVMisc0123], 1, [1], 1>;
// Floating point and/or/xor logicals (ZMM).
defm : X86WriteResPairUnsupported<WriteFLogicZ>;
// Floating point TEST instructions.
// FIXME: latency not from llvm-exegesis
defm : Zn3WriteResXMMPair<WriteFTest, [Zn3FPFMisc12], 1, [2], 2>;
// Floating point TEST instructions (YMM).
// FIXME: latency not from llvm-exegesis
defm : Zn3WriteResYMMPair<WriteFTestY, [Zn3FPFMisc12], 1, [2], 2>;
// Floating point TEST instructions (ZMM).
defm : X86WriteResPairUnsupported<WriteFTestZ>;
// Floating point vector shuffles.
defm : Zn3WriteResXMMPair<WriteFShuffle, [Zn3FPVShuf01], 1, [1], 1>;
// Floating point vector shuffles (YMM).
defm : Zn3WriteResYMMPair<WriteFShuffleY, [Zn3FPVShuf01], 1, [1], 1>;
// Floating point vector shuffles (ZMM).
defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
// Floating point vector variable shuffles.
defm : Zn3WriteResXMMPair<WriteFVarShuffle, [Zn3FPVShuf01], 3, [1], 1>;
// Floating point vector variable shuffles (YMM).
defm : Zn3WriteResYMMPair<WriteFVarShuffleY, [Zn3FPVShuf01], 3, [1], 1>;
// Floating point vector variable shuffles (ZMM).
defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
// Floating point vector blends.
defm : Zn3WriteResXMMPair<WriteFBlend, [Zn3FPFMul01], 1, [1], 1>;
// Floating point vector blends (YMM).
defm : Zn3WriteResYMMPair<WriteFBlendY, [Zn3FPFMul01], 1, [1], 1>;
// Floating point vector blends (ZMM).
defm : X86WriteResPairUnsupported<WriteFBlendZ>;
// Fp vector variable blends.
defm : Zn3WriteResXMMPair<WriteFVarBlend, [Zn3FPFMul01], 1, [1], 1>;
// Fp vector variable blends (YMM).
defm : Zn3WriteResYMMPair<WriteFVarBlendY, [Zn3FPFMul01], 1, [1], 1>;
// Fp vector variable blends (ZMM).
defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
992
// Horizontal Add/Sub (float and integer)

// Float forms, bound to Zn3FPFAdd0.
defm : Zn3WriteResXMMPair<WriteFHAdd, [Zn3FPFAdd0], 6, [2], 4>;
defm : Zn3WriteResYMMPair<WriteFHAddY, [Zn3FPFAdd0], 6, [2], 3,
                          /*LoadUOps=*/1>;
defm : X86WriteResPairUnsupported<WriteFHAddZ>;

// Integer forms, bound to Zn3FPVAdd0.
defm : Zn3WriteResXMMPair<WritePHAdd, [Zn3FPVAdd0], 2, [2], 3,
                          /*LoadUOps=*/1>;
defm : Zn3WriteResXMMPair<WritePHAddX, [Zn3FPVAdd0], 2, [2], 4>;
defm : Zn3WriteResYMMPair<WritePHAddY, [Zn3FPVAdd0], 2, [2], 3,
                          /*LoadUOps=*/1>;
defm : X86WriteResPairUnsupported<WritePHAddZ>;
1001
// Vector integer operations.

// Vector loads (plain, non-temporal and masked): VecLoadLatency plus one.
defm : Zn3WriteResXMM<WriteVecLoad, [Zn3FPLd01, Zn3Load],
                      !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecLoadX, [Zn3FPLd01, Zn3Load],
                      !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteVecLoadY, [Zn3FPLd01, Zn3Load],
                      !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecLoadNT, [Zn3FPLd01, Zn3Load],
                      !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteVecLoadNTY, [Zn3FPLd01, Zn3Load],
                      !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecMaskedLoad, [Zn3FPLd01, Zn3Load],
                      !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteVecMaskedLoadY, [Zn3FPLd01, Zn3Load],
                      !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;

// Vector stores (default and XMM classes).
defm : Zn3WriteResXMM<WriteVecStore, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecStoreX, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [1, 1], 1>;
1012
// Override for the register forms VEXTRACTF128rr / VEXTRACTI128rr.
def Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn3FPFMisc0]> {
  let NumMicroOps = 1;
  let Latency = 4;
  let ResourceCycles = [1];
}
def : InstRW<[Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr],
             (instrs VEXTRACTF128rr, VEXTRACTI128rr)>;
1019
// Override for VEXTRACTI128mr / VEXTRACTF128mr. Latency and uop count are
// derived from the register-form record
// Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr (one extra uop).
// NOTE(review): the resources used here are store resources (Zn3FPSt,
// Zn3Store), yet the latency adds Znver3Model.LoadLatency rather than
// StoreLatency -- confirm this is intentional before changing it.
def Zn3WriteVEXTRACTI128mr : SchedWriteRes<[Zn3FPFMisc0, Zn3FPSt, Zn3Store]> {
  let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1);
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
  let ResourceCycles = [1, 1, 1];
}
def : InstRW<[Zn3WriteVEXTRACTI128mr],
             (instrs VEXTRACTI128mr, VEXTRACTF128mr)>;
1026
// Override for VINSERTF128rm. Latency is the model's LoadLatency plus the
// latency of Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr; the uop count equals
// that record's uop count (the added constant is 0).
def Zn3WriteVINSERTF128rmr : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPFMisc0]> {
  let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0);
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
  let ResourceCycles = [1, 1, 1];
}
def : InstRW<[Zn3WriteVINSERTF128rmr],
             (instrs VINSERTF128rm)>;
1033
// Vector stores, YMM and non-temporal classes.
defm : Zn3WriteResYMM<WriteVecStoreY, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecStoreNT, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResYMM<WriteVecStoreNTY, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [1, 1], 1>;
// Vector masked stores.
defm : Zn3WriteResXMM<WriteVecMaskedStore32, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [6, 1], 18>;
defm : Zn3WriteResXMM<WriteVecMaskedStore64, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [4, 1], 10>;
defm : Zn3WriteResYMM<WriteVecMaskedStore32Y, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [12, 1], 42>;
defm : Zn3WriteResYMM<WriteVecMaskedStore64Y, [Zn3FPSt, Zn3Store],
                      Znver3Model.StoreLatency, [6, 1], 18>;
1041
// Moves between vector registers and GPRs, both directions; the two
// entries use identical arguments.
defm : Zn3WriteResXMM<WriteVecMoveToGpr, [Zn3FPLd01],
                      1, [2], 1>;
defm : Zn3WriteResXMM<WriteVecMoveFromGpr, [Zn3FPLd01],
                      1, [2], 1>;
1044
// Override for MMX_MOVQ2FR64rr / MMX_MOVQ2DQrr.
def Zn3WriteMOVMMX : SchedWriteRes<[Zn3FPLd01, Zn3FPFMisc0123]> {
  let NumMicroOps = 2;
  let Latency = 1;
  let ResourceCycles = [1, 2];
}
def : InstRW<[Zn3WriteMOVMMX],
             (instrs MMX_MOVQ2FR64rr, MMX_MOVQ2DQrr)>;
1051
// Override for MMX_MOVD64rr / MMX_MOVD64to64rr. Same resources, latency and
// uop count as Zn3WriteMOVMMX, but Zn3FPFMisc0123 is held for 4 cycles
// instead of 2.
def Zn3WriteMOVMMXSlow : SchedWriteRes<[Zn3FPLd01, Zn3FPFMisc0123]> {
  let NumMicroOps = 2;
  let Latency = 1;
  let ResourceCycles = [1, 4];
}
def : InstRW<[Zn3WriteMOVMMXSlow],
             (instrs MMX_MOVD64rr, MMX_MOVD64to64rr)>;
1058
// Vector integer ALU op, no logicals.
defm : Zn3WriteResXMMPair<WriteVecALU, [Zn3FPVAdd0123], 1, [1], 1>;

// Override for EXTRQ / INSERTQ.
def Zn3WriteEXTRQ_INSERTQ : SchedWriteRes<[Zn3FPVShuf01, Zn3FPLd01]> {
  let NumMicroOps = 1;
  let Latency = 3;
  let ResourceCycles = [1, 1];
}
def : InstRW<[Zn3WriteEXTRQ_INSERTQ],
             (instrs EXTRQ, INSERTQ)>;
1067
// Override for EXTRQI / INSERTQI. Matches Zn3WriteEXTRQ_INSERTQ except that
// it is modelled as two uops.
def Zn3WriteEXTRQI_INSERTQI : SchedWriteRes<[Zn3FPVShuf01, Zn3FPLd01]> {
  let NumMicroOps = 2;
  let Latency = 3;
  let ResourceCycles = [1, 1];
}
def : InstRW<[Zn3WriteEXTRQI_INSERTQI],
             (instrs EXTRQI, INSERTQI)>;
1074
// Vector integer ALU op, no logicals (XMM).
defm : Zn3WriteResXMMPair<WriteVecALUX, [Zn3FPVAdd0123], 1, [1], 1>;

// Override for the XMM register forms listed below: they are bound to
// Zn3FPVAdd01 instead of the Zn3FPVAdd0123 used by WriteVecALUX.
def Zn3WriteVecALUXSlow : SchedWriteRes<[Zn3FPVAdd01]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ResourceCycles = [1];
}
def : InstRW<[Zn3WriteVecALUXSlow],
             (instrs PABSBrr, PABSDrr, PABSWrr,
                     PADDSBrr, PADDSWrr, PADDUSBrr, PADDUSWrr,
                     PAVGBrr, PAVGWrr,
                     PSIGNBrr, PSIGNDrr, PSIGNWrr,
                     VPABSBrr, VPABSDrr, VPABSWrr,
                     VPADDSBrr, VPADDSWrr, VPADDUSBrr, VPADDUSWrr,
                     VPAVGBrr, VPAVGWrr,
                     VPCMPEQQrr,
                     VPSIGNBrr, VPSIGNDrr, VPSIGNWrr,
                     PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
                     VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr)>;
1092
// Override for the MMX register forms listed below, bound to Zn3FPVAdd01.
def Zn3WriteVecALUXMMX : SchedWriteRes<[Zn3FPVAdd01]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ResourceCycles = [1];
}
def : InstRW<[Zn3WriteVecALUXMMX],
             (instrs MMX_PABSBrr, MMX_PABSDrr, MMX_PABSWrr,
                     MMX_PSIGNBrr, MMX_PSIGNDrr, MMX_PSIGNWrr,
                     MMX_PADDSBirr, MMX_PADDSWirr,
                     MMX_PADDUSBirr, MMX_PADDUSWirr,
                     MMX_PAVGBirr, MMX_PAVGWirr,
                     MMX_PSUBSBirr, MMX_PSUBSWirr,
                     MMX_PSUBUSBirr, MMX_PSUBUSWirr)>;
1103
// Vector integer ALU op, no logicals (YMM).
defm : Zn3WriteResYMMPair<WriteVecALUY, [Zn3FPVAdd0123], 1, [1], 1>;

// Override for the YMM register forms listed below: they are bound to
// Zn3FPVAdd01 instead of the Zn3FPVAdd0123 used by WriteVecALUY.
def Zn3WriteVecALUYSlow : SchedWriteRes<[Zn3FPVAdd01]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ResourceCycles = [1];
}
def : InstRW<[Zn3WriteVecALUYSlow],
             (instrs VPABSBYrr, VPABSDYrr, VPABSWYrr,
                     VPADDSBYrr, VPADDSWYrr, VPADDUSBYrr, VPADDUSWYrr,
                     VPSUBSBYrr, VPSUBSWYrr, VPSUBUSBYrr, VPSUBUSWYrr,
                     VPAVGBYrr, VPAVGWYrr,
                     VPCMPEQQYrr,
                     VPSIGNBYrr, VPSIGNDYrr, VPSIGNWYrr)>;
1117
// Vector integer ALU op, no logicals (ZMM).
defm : X86WriteResPairUnsupported<WriteVecALUZ>;
// Vector integer and/or/xor logicals.
defm : Zn3WriteResXMMPair<WriteVecLogic, [Zn3FPVMisc0123], 1, [1], 1>;
// Vector integer and/or/xor logicals (XMM).
defm : Zn3WriteResXMMPair<WriteVecLogicX, [Zn3FPVMisc0123], 1, [1], 1>;
// Vector integer and/or/xor logicals (YMM).
defm : Zn3WriteResYMMPair<WriteVecLogicY, [Zn3FPVMisc0123], 1, [1], 1>;
// Vector integer and/or/xor logicals (ZMM).
defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
// Vector integer TEST instructions.
// FIXME: latency not from llvm-exegesis
defm : Zn3WriteResXMMPair<WriteVecTest, [Zn3FPVAdd12, Zn3FPSt], 1, [1, 1],
                          2>;
// Vector integer TEST instructions (YMM).
// FIXME: latency not from llvm-exegesis
defm : Zn3WriteResYMMPair<WriteVecTestY, [Zn3FPVAdd12, Zn3FPSt], 1, [1, 1],
                          2>;
// Vector integer TEST instructions (ZMM).
defm : X86WriteResPairUnsupported<WriteVecTestZ>;
// Vector integer shifts (default).
defm : Zn3WriteResXMMPair<WriteVecShift, [Zn3FPVShift01], 1, [1], 1>;
// Vector integer shifts (XMM).
defm : Zn3WriteResXMMPair<WriteVecShiftX, [Zn3FPVShift01], 1, [1], 1>;
// Vector integer shifts (YMM).
defm : Zn3WriteResYMMPair<WriteVecShiftY, [Zn3FPVShift01], 1, [1], 1>;
// Vector integer shifts (ZMM).
defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
// Vector integer immediate shifts (default).
defm : Zn3WriteResXMMPair<WriteVecShiftImm, [Zn3FPVShift01], 1, [1], 1>;
// Vector integer immediate shifts (XMM).
defm : Zn3WriteResXMMPair<WriteVecShiftImmX, [Zn3FPVShift01], 1, [1], 1>;
// Vector integer immediate shifts (YMM).
defm : Zn3WriteResYMMPair<WriteVecShiftImmY, [Zn3FPVShift01], 1, [1], 1>;
// Vector integer immediate shifts (ZMM).
defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
// Vector integer multiply (default).
defm : Zn3WriteResXMMPair<WriteVecIMul, [Zn3FPVMul01], 3, [1], 1>;
// Vector integer multiply (XMM).
defm : Zn3WriteResXMMPair<WriteVecIMulX, [Zn3FPVMul01], 3, [1], 1>;
// Vector integer multiply (YMM).
defm : Zn3WriteResYMMPair<WriteVecIMulY, [Zn3FPVMul01], 3, [1], 1>;
// Vector integer multiply (ZMM).
defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
// Vector PMULLD.
defm : Zn3WriteResXMMPair<WritePMULLD, [Zn3FPVMul01], 3, [1], 1>;
// Vector PMULLD (YMM).
defm : Zn3WriteResYMMPair<WritePMULLDY, [Zn3FPVMul01], 3, [1], 1>;
// Vector PMULLD (ZMM).
defm : X86WriteResPairUnsupported<WritePMULLDZ>;
// Vector shuffles.
defm : Zn3WriteResXMMPair<WriteShuffle, [Zn3FPVShuf01], 1, [1], 1>;
// Vector shuffles (XMM).
defm : Zn3WriteResXMMPair<WriteShuffleX, [Zn3FPVShuf01], 1, [1], 1>;
// Vector shuffles (YMM).
defm : Zn3WriteResYMMPair<WriteShuffleY, [Zn3FPVShuf01], 1, [1], 1>;
// Vector shuffles (ZMM).
defm : X86WriteResPairUnsupported<WriteShuffleZ>;
// Vector variable shuffles.
defm : Zn3WriteResXMMPair<WriteVarShuffle, [Zn3FPVShift01], 1, [1], 1>;
// Vector variable shuffles (XMM).
defm : Zn3WriteResXMMPair<WriteVarShuffleX, [Zn3FPVShift01], 1, [1], 1>;
// Vector variable shuffles (YMM).
defm : Zn3WriteResYMMPair<WriteVarShuffleY, [Zn3FPVShift01], 1, [1], 1>;
// Vector variable shuffles (ZMM).
defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
// Vector blends.
defm : Zn3WriteResXMMPair<WriteBlend, [Zn3FPVMisc0123], 1, [1], 1>;
// Vector blends (YMM).
defm : Zn3WriteResYMMPair<WriteBlendY, [Zn3FPVMisc0123], 1, [1], 1>;
// Vector blends (ZMM).
defm : X86WriteResPairUnsupported<WriteBlendZ>;
// Vector variable blends.
defm : Zn3WriteResXMMPair<WriteVarBlend, [Zn3FPVMul01], 1, [1], 1>;
// Vector variable blends (YMM).
defm : Zn3WriteResYMMPair<WriteVarBlendY, [Zn3FPVMul01], 1, [1], 1>;
// Vector variable blends (ZMM).
defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
// Vector PSADBW.
defm : Zn3WriteResXMMPair<WritePSADBW, [Zn3FPVAdd0123], 3, [2], 1>;
// Vector PSADBW (XMM).
defm : Zn3WriteResXMMPair<WritePSADBWX, [Zn3FPVAdd0123], 3, [2], 1>;
// Vector PSADBW (YMM).
defm : Zn3WriteResYMMPair<WritePSADBWY, [Zn3FPVAdd0123], 3, [2], 1>;
// Vector PSADBW (ZMM).
defm : X86WriteResPairUnsupported<WritePSADBWZ>;
// Vector MPSAD.
defm : Zn3WriteResXMMPair<WriteMPSAD, [Zn3FPVAdd0123], 4, [8], 4,
                          /*LoadUOps=*/2>;
// Vector MPSAD (YMM).
defm : Zn3WriteResYMMPair<WriteMPSADY, [Zn3FPVAdd0123], 4, [8], 3,
                          /*LoadUOps=*/1>;
// Vector MPSAD (ZMM).
defm : X86WriteResPairUnsupported<WriteMPSADZ>;
// Vector PHMINPOS.
defm : Zn3WriteResXMMPair<WritePHMINPOS, [Zn3FPVAdd01], 3, [1], 1>;
1163
// Vector insert/extract operations.

// Insert gpr to vector element.
defm : Zn3WriteResXMMPair<WriteVecInsert, [Zn3FPLd01], 1, [2], 2,
                          /*LoadUOps=*/-1>;
// Extract vector element to gpr.
defm : Zn3WriteResXMM<WriteVecExtract, [Zn3FPLd01], 1, [2], 2>;
// Extract vector element and store.
defm : Zn3WriteResXMM<WriteVecExtractSt, [Zn3FPSt, Zn3Store],
                      !add(1, Znver3Model.StoreLatency), [1, 1], 2>;
1168
// MOVMSK operations.
// All four classes are bound to Zn3FPVMisc2 with identical arguments.
defm : Zn3WriteResXMM<WriteFMOVMSK, [Zn3FPVMisc2],
                      1, [1], 1>;
defm : Zn3WriteResXMM<WriteVecMOVMSK, [Zn3FPVMisc2],
                      1, [1], 1>;
defm : Zn3WriteResYMM<WriteVecMOVMSKY, [Zn3FPVMisc2],
                      1, [1], 1>;
defm : Zn3WriteResXMM<WriteMMXMOVMSK, [Zn3FPVMisc2],
                      1, [1], 1>;
1174
// Conversion between integer and float.

// Double -> Integer.
defm : Zn3WriteResXMMPair<WriteCvtSD2I, [Zn3FPFCvt01], 2, [2], 2>;
// Double -> Integer (XMM).
defm : Zn3WriteResXMMPair<WriteCvtPD2I, [Zn3FPFCvt01], 3, [1], 1>;
// Double -> Integer (YMM).
defm : Zn3WriteResYMMPair<WriteCvtPD2IY, [Zn3FPFCvt01], 6, [2], 2>;
// Double -> Integer (ZMM).
defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
1180
// Override for the MMX CVT(T)PD2PI forms listed below (both the register
// and the memory variants map to this one record).
def Zn3WriteCvtPD2IMMX : SchedWriteRes<[Zn3FPFCvt01]> {
  let NumMicroOps = 2;
  let Latency = 1;
  let ResourceCycles = [2];
}
def : InstRW<[Zn3WriteCvtPD2IMMX],
             (instrs MMX_CVTPD2PIirm, MMX_CVTTPD2PIirm,
                     MMX_CVTPD2PIirr, MMX_CVTTPD2PIirr)>;
1187
// Float -> Integer.
defm : Zn3WriteResXMMPair<WriteCvtSS2I, [Zn3FPFCvt01], 2, [2], 2>;

// Float -> Integer (XMM).
defm : Zn3WriteResXMMPair<WriteCvtPS2I, [Zn3FPFCvt01], 3, [1], 1>;
// Float -> Integer (YMM).
defm : Zn3WriteResYMMPair<WriteCvtPS2IY, [Zn3FPFCvt01], 3, [1], 1>;
// Float -> Integer (ZMM).
defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;

// Integer -> Double.
defm : Zn3WriteResXMMPair<WriteCvtI2SD, [Zn3FPFCvt01], 3, [2], 2,
                          /*LoadUOps=*/-1>;
// Integer -> Double (XMM).
defm : Zn3WriteResXMMPair<WriteCvtI2PD, [Zn3FPFCvt01], 3, [1], 1>;
// Integer -> Double (YMM).
defm : Zn3WriteResYMMPair<WriteCvtI2PDY, [Zn3FPFCvt01], 4, [2], 2,
                          /*LoadUOps=*/-1>;
// Integer -> Double (ZMM).
defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
1198
// Override for MMX_CVTPI2PDirm / MMX_CVTPI2PDirr (the register and memory
// variants map to this one record).
def Zn3WriteCvtI2PDMMX : SchedWriteRes<[Zn3FPFCvt01]> {
  let NumMicroOps = 2;
  let Latency = 2;
  let ResourceCycles = [6];
}
def : InstRW<[Zn3WriteCvtI2PDMMX],
             (instrs MMX_CVTPI2PDirm, MMX_CVTPI2PDirr)>;
1205
// Integer -> Float.
defm : Zn3WriteResXMMPair<WriteCvtI2SS, [Zn3FPFCvt01], 3, [2], 2,
                          /*LoadUOps=*/-1>;
// Integer -> Float (XMM).
defm : Zn3WriteResXMMPair<WriteCvtI2PS, [Zn3FPFCvt01], 3, [1], 1>;
// Integer -> Float (YMM).
defm : Zn3WriteResYMMPair<WriteCvtI2PSY, [Zn3FPFCvt01], 3, [1], 1>;
// Integer -> Float (ZMM).
defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
1210
// Override for MMX_CVTPI2PSirr.
def Zn3WriteCvtI2PSMMX : SchedWriteRes<[Zn3FPFCvt01]> {
  let NumMicroOps = 2;
  let Latency = 3;
  let ResourceCycles = [1];
}
def : InstRW<[Zn3WriteCvtI2PSMMX],
             (instrs MMX_CVTPI2PSirr)>;
1217
// Float -> Double size conversion.
defm : Zn3WriteResXMMPair<WriteCvtSS2SD, [Zn3FPFCvt01], 3, [1], 1>;
// Float -> Double size conversion (XMM).
defm : Zn3WriteResXMMPair<WriteCvtPS2PD, [Zn3FPFCvt01], 3, [1], 1>;
// Float -> Double size conversion (YMM).
defm : Zn3WriteResYMMPair<WriteCvtPS2PDY, [Zn3FPFCvt01], 4, [2], 2,
                          /*LoadUOps=*/-1>;
// Float -> Double size conversion (ZMM).
defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;

// Double -> Float size conversion.
defm : Zn3WriteResXMMPair<WriteCvtSD2SS, [Zn3FPFCvt01], 3, [1], 1>;
// Double -> Float size conversion (XMM).
defm : Zn3WriteResXMMPair<WriteCvtPD2PS, [Zn3FPFCvt01], 3, [1], 1>;
// Double -> Float size conversion (YMM).
defm : Zn3WriteResYMMPair<WriteCvtPD2PSY, [Zn3FPFCvt01], 6, [2], 2>;
// Double -> Float size conversion (ZMM).
defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
1227
1228defm : Zn3WriteResXMMPair<WriteCvtPH2PS, [Zn3FPFCvt01], 3, [1], 1>; // Half -> Float size conversion.
1229defm : Zn3WriteResYMMPair<WriteCvtPH2PSY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Half -> Float size conversion (YMM).
1230defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>; // Half -> Float size conversion (ZMM).
1231
1232defm : Zn3WriteResXMM<WriteCvtPS2PH, [Zn3FPFCvt01], 3, [2], 1>; // Float -> Half size conversion.
1233defm : Zn3WriteResYMM<WriteCvtPS2PHY, [Zn3FPFCvt01], 6, [2], 2>; // Float -> Half size conversion (YMM).
1234defm : X86WriteResUnsupported<WriteCvtPS2PHZ>; // Float -> Half size conversion (ZMM).
1235defm : Zn3WriteResXMM<WriteCvtPS2PHSt, [Zn3FPFCvt01, Zn3FPSt, Zn3Store], !add(3, Znver3Model.StoreLatency), [1, 1, 1], 2>; // Float -> Half + store size conversion.
1236defm : Zn3WriteResYMM<WriteCvtPS2PHYSt, [Zn3FPFCvt01, Zn3FPSt, Zn3Store], !add(6, Znver3Model.StoreLatency), [2, 1, 1], 3>; // Float -> Half + store size conversion (YMM).
1237defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>; // Float -> Half + store size conversion (ZMM).
1238
// CRC32 instruction: bound to Zn3ALU1 only.
defm : Zn3WriteResIntPair<WriteCRC32, [Zn3ALU1],
                          3, [1], 1>;
1241
// SHA1MSG1, register form: two uops, latency 2, Zn3FPU0123 reserved for
// 2 cycles.
def Zn3WriteSHA1MSG1rr : SchedWriteRes<[Zn3FPU0123]> {
  let NumMicroOps = 2;
  let ResourceCycles = [2];
  let Latency = 2;
}
def : InstRW<[Zn3WriteSHA1MSG1rr],
             (instrs SHA1MSG1rr)>;
1248
// SHA1MSG1, memory form. Derived from Zn3WriteSHA1MSG1rr: the model's load
// latency is added on top of the register-form latency, and no extra uop is
// added for the load.
def Zn3WriteSHA1MSG1rm
    : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let NumMicroOps = !add(Zn3WriteSHA1MSG1rr.NumMicroOps, 0);
  let ResourceCycles = [1, 1, 2];
  let Latency = !add(Znver3Model.LoadLatency,
                     Zn3WriteSHA1MSG1rr.Latency);
}
def : InstRW<[Zn3WriteSHA1MSG1rm],
             (instrs SHA1MSG1rm)>;
1255
// SHA1MSG2 and SHA1NEXTE, register forms: one uop, latency 1, Zn3FPU0123
// reserved for 2 cycles.
def Zn3WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn3FPU0123]> {
  let NumMicroOps = 1;
  let ResourceCycles = [2];
  let Latency = 1;
}
def : InstRW<[Zn3WriteSHA1MSG2rr_SHA1NEXTErr],
             (instrs SHA1MSG2rr, SHA1NEXTErr)>;
1262
// SHA1MSG2 and SHA1NEXTE, memory forms. Derived from the register-form write:
// load latency is added to its latency, and its uop count is reused as is.
def Zn3Writerm_SHA1MSG2rm_SHA1NEXTErm
    : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let NumMicroOps = !add(Zn3WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps, 0);
  let ResourceCycles = [1, 1, 2];
  let Latency = !add(Znver3Model.LoadLatency,
                     Zn3WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
}
def : InstRW<[Zn3Writerm_SHA1MSG2rm_SHA1NEXTErm],
             (instrs SHA1MSG2rm, SHA1NEXTErm)>;
1269
// SHA256MSG1, register form: two uops, latency 2, Zn3FPU0123 reserved for
// 3 cycles.
def Zn3WriteSHA256MSG1rr : SchedWriteRes<[Zn3FPU0123]> {
  let NumMicroOps = 2;
  let ResourceCycles = [3];
  let Latency = 2;
}
def : InstRW<[Zn3WriteSHA256MSG1rr],
             (instrs SHA256MSG1rr)>;
1276
// SHA256MSG1, memory form. Derived from Zn3WriteSHA256MSG1rr: load latency is
// added to its latency, and its uop count is reused as is.
def Zn3Writerm_SHA256MSG1rm
    : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let NumMicroOps = !add(Zn3WriteSHA256MSG1rr.NumMicroOps, 0);
  let ResourceCycles = [1, 1, 3];
  let Latency = !add(Znver3Model.LoadLatency,
                     Zn3WriteSHA256MSG1rr.Latency);
}
def : InstRW<[Zn3Writerm_SHA256MSG1rm],
             (instrs SHA256MSG1rm)>;
1283
// SHA256MSG2, register form: four uops, latency 3, Zn3FPU0123 reserved for
// 8 cycles.
def Zn3WriteSHA256MSG2rr : SchedWriteRes<[Zn3FPU0123]> {
  let NumMicroOps = 4;
  let ResourceCycles = [8];
  let Latency = 3;
}
def : InstRW<[Zn3WriteSHA256MSG2rr],
             (instrs SHA256MSG2rr)>;
1290
// SHA256MSG2, memory form. Derived from Zn3WriteSHA256MSG2rr: load latency is
// added to its latency, and one extra uop is added to its uop count.
def Zn3WriteSHA256MSG2rm
    : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let NumMicroOps = !add(Zn3WriteSHA256MSG2rr.NumMicroOps, 1);
  let ResourceCycles = [1, 1, 8];
  let Latency = !add(Znver3Model.LoadLatency,
                     Zn3WriteSHA256MSG2rr.Latency);
}
def : InstRW<[Zn3WriteSHA256MSG2rm],
             (instrs SHA256MSG2rm)>;
1297
// SHA1RNDS4, register+immediate form: one uop, latency 6, Zn3FPU0123 reserved
// for 8 cycles.
def Zn3WriteSHA1RNDS4rri : SchedWriteRes<[Zn3FPU0123]> {
  let NumMicroOps = 1;
  let ResourceCycles = [8];
  let Latency = 6;
}
def : InstRW<[Zn3WriteSHA1RNDS4rri],
             (instrs SHA1RNDS4rri)>;
1304
// SHA256RNDS2, register form: one uop, latency 4, Zn3FPU0123 reserved for
// 8 cycles.
def Zn3WriteSHA256RNDS2rr : SchedWriteRes<[Zn3FPU0123]> {
  let NumMicroOps = 1;
  let ResourceCycles = [8];
  let Latency = 4;
}
def : InstRW<[Zn3WriteSHA256RNDS2rr],
             (instrs SHA256RNDS2rr)>;
1311
// String instructions. All four writes are bound to Zn3FPVAdd0123.

// Packed Compare Implicit Length Strings, Return Mask
defm : Zn3WriteResXMMPair<WritePCmpIStrM, [Zn3FPVAdd0123],
                          6, [8], 3, /*LoadUOps=*/1>;

// Packed Compare Explicit Length Strings, Return Mask
defm : Zn3WriteResXMMPair<WritePCmpEStrM, [Zn3FPVAdd0123],
                          6, [12], 7, /*LoadUOps=*/5>;

// Packed Compare Implicit Length Strings, Return Index
defm : Zn3WriteResXMMPair<WritePCmpIStrI, [Zn3FPVAdd0123],
                          2, [8], 4>;

// Packed Compare Explicit Length Strings, Return Index
defm : Zn3WriteResXMMPair<WritePCmpEStrI, [Zn3FPVAdd0123],
                          6, [12], 8, /*LoadUOps=*/4>;
1321
// AES instructions. All three writes share the same numbers on Zn3FPAES01.
// Decryption, encryption.
defm : Zn3WriteResXMMPair<WriteAESDecEnc, [Zn3FPAES01],
                          4, [1], 1>;
// InvMixColumn.
defm : Zn3WriteResXMMPair<WriteAESIMC, [Zn3FPAES01],
                          4, [1], 1>;
// Key Generation.
defm : Zn3WriteResXMMPair<WriteAESKeyGen, [Zn3FPAES01],
                          4, [1], 1>;

// Carry-less multiplication instructions.
defm : Zn3WriteResXMMPair<WriteCLMul, [Zn3FPCLM01],
                          4, [4], 4>;
1329
// EMMS/FEMMS
// FIXME: latency not from llvm-exegesis
defm : Zn3WriteResInt<WriteEMMS, [Zn3ALU0123],
                      2, [1], 1>;

// Load/store MXCSR
// FIXME: latency not from llvm-exegesis
defm : Zn3WriteResInt<WriteLDMXCSR,
                      [Zn3AGU012, Zn3Load, Zn3ALU0123],
                      !add(Znver3Model.LoadLatency, 1),
                      [1, 1, 6], 1>;
// FIXME: latency not from llvm-exegesis
defm : Zn3WriteResInt<WriteSTMXCSR,
                      [Zn3ALU0123, Zn3AGU012, Zn3Store],
                      !add(1, Znver3Model.StoreLatency),
                      [60, 1, 1], 2>;

// Catch-all for expensive system instructions.
defm : Zn3WriteResInt<WriteSystem, [Zn3ALU0123],
                      100, [100], 100>;
1339
// VZEROUPPER: one uop with zero latency, Zn3FPU0123 reserved for 1 cycle.
def Zn3WriteVZEROUPPER : SchedWriteRes<[Zn3FPU0123]> {
  let NumMicroOps = 1;
  let ResourceCycles = [1];
  let Latency = 0; // FIXME: not from llvm-exegesis
}
def : InstRW<[Zn3WriteVZEROUPPER],
             (instrs VZEROUPPER)>;
1346
// VZEROALL: 18 uops, latency 10, Zn3FPU0123 reserved for 24 cycles.
def Zn3WriteVZEROALL : SchedWriteRes<[Zn3FPU0123]> {
  let NumMicroOps = 18;
  let ResourceCycles = [24];
  let Latency = 10; // FIXME: not from llvm-exegesis
}
def : InstRW<[Zn3WriteVZEROALL],
             (instrs VZEROALL)>;
1353
// AVX2.
// Fp 256-bit width vector shuffles.
defm : Zn3WriteResYMMPair<WriteFShuffle256, [Zn3FPVShuf],
                          2, [1], 1, /*LoadUOps=*/2>;
// Fp 256-bit width variable shuffles.
defm : Zn3WriteResYMMPair<WriteFVarShuffle256, [Zn3FPVShuf],
                          7, [1], 2, /*LoadUOps=*/1>;
// 256-bit width vector shuffles.
defm : Zn3WriteResYMMPair<WriteShuffle256, [Zn3FPVShuf],
                          2, [1], 1>;
1358
// VPERM2I128 / VPERM2F128, register forms: one uop, latency 3, Zn3FPVShuf
// reserved for 1 cycle.
def Zn3WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn3FPVShuf]> {
  let NumMicroOps = 1;
  let ResourceCycles = [1];
  let Latency = 3;
}
def : InstRW<[Zn3WriteVPERM2I128rr_VPERM2F128rr],
             (instrs VPERM2I128rr, VPERM2F128rr)>;
1365
// VPERM2F128, memory form. Derived from the register-form write: load latency
// is added to its latency, and its uop count is reused as is.
def Zn3WriteVPERM2F128rm
    : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
  let NumMicroOps = !add(Zn3WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0);
  let ResourceCycles = [1, 1, 1];
  let Latency = !add(Znver3Model.LoadLatency,
                     Zn3WriteVPERM2I128rr_VPERM2F128rr.Latency);
}
def : InstRW<[Zn3WriteVPERM2F128rm],
             (instrs VPERM2F128rm)>;
1372
// VPERMPS (YMM), register form: two uops, latency 7, Zn3FPVShuf reserved for
// 1 cycle.
def Zn3WriteVPERMPSYrr : SchedWriteRes<[Zn3FPVShuf]> {
  let NumMicroOps = 2;
  let ResourceCycles = [1];
  let Latency = 7;
}
def : InstRW<[Zn3WriteVPERMPSYrr],
             (instrs VPERMPSYrr)>;
1379
// VPERMPS (YMM), memory form. Derived from Zn3WriteVPERMPSYrr: load latency is
// added to its latency, and one extra uop is added to its uop count.
def Zn3WriteVPERMPSYrm
    : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
  let NumMicroOps = !add(Zn3WriteVPERMPSYrr.NumMicroOps, 1);
  let ResourceCycles = [1, 1, 2];
  let Latency = !add(Znver3Model.LoadLatency,
                     Zn3WriteVPERMPSYrr.Latency);
}
def : InstRW<[Zn3WriteVPERMPSYrm],
             (instrs VPERMPSYrm)>;
1386
// VPERMPD / VPERMQ (YMM), register+immediate forms: two uops, latency 6,
// Zn3FPVShuf reserved for 1 cycle.
def Zn3WriteVPERMYri : SchedWriteRes<[Zn3FPVShuf]> {
  let NumMicroOps = 2;
  let ResourceCycles = [1];
  let Latency = 6;
}
def : InstRW<[Zn3WriteVPERMYri],
             (instrs VPERMPDYri, VPERMQYri)>;
1393
// VPERMPD (YMM), memory+immediate form. Derived from Zn3WriteVPERMYri: load
// latency is added to its latency, and one extra uop is added to its uop count.
def Zn3WriteVPERMPDYmi
    : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
  let NumMicroOps = !add(Zn3WriteVPERMYri.NumMicroOps, 1);
  let ResourceCycles = [1, 1, 2];
  let Latency = !add(Znver3Model.LoadLatency,
                     Zn3WriteVPERMYri.Latency);
}
def : InstRW<[Zn3WriteVPERMPDYmi],
             (instrs VPERMPDYmi)>;
1400
// VPERMD (YMM), register form: two uops, latency 5, Zn3FPVShuf reserved for
// 1 cycle.
def Zn3WriteVPERMDYrr : SchedWriteRes<[Zn3FPVShuf]> {
  let NumMicroOps = 2;
  let ResourceCycles = [1];
  let Latency = 5;
}
def : InstRW<[Zn3WriteVPERMDYrr],
             (instrs VPERMDYrr)>;
1407
// Memory forms of VPERMQ (immediate) and VPERMD (YMM). Both are derived from
// Zn3WriteVPERMDYrr: load latency is added to its latency, and its uop count
// is reused as is.
def Zn3WriteVPERMYm
    : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
  let NumMicroOps = !add(Zn3WriteVPERMDYrr.NumMicroOps, 0);
  let ResourceCycles = [1, 1, 2];
  let Latency = !add(Znver3Model.LoadLatency,
                     Zn3WriteVPERMDYrr.Latency);
}
def : InstRW<[Zn3WriteVPERMYm],
             (instrs VPERMQYmi, VPERMDYrm)>;
1414
// 256-bit width packed vector width-changing move.
defm : Zn3WriteResYMMPair<WriteVPMOV256, [Zn3FPVShuf01], 4, [3], 2, /*LoadUOps=*/-1>;
// 256-bit width vector variable shuffles.
// Bound to the shuffle group Zn3FPVShuf01, the same group WriteVPMOV256 uses,
// rather than to the vector-shift group used by the variable shifts below.
defm : Zn3WriteResYMMPair<WriteVarShuffle256, [Zn3FPVShuf01], 1, [1], 2>;
// Variable vector shifts.
defm : Zn3WriteResXMMPair<WriteVarVecShift, [Zn3FPVShift01], 1, [1], 1>;
// Variable vector shifts (YMM).
defm : Zn3WriteResYMMPair<WriteVarVecShiftY, [Zn3FPVShift01], 1, [1], 1>;
// Variable vector shifts (ZMM): marked unsupported in this model.
defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
1420
// Old microcoded instructions that nobody uses.
defm : Zn3WriteResInt<WriteMicrocoded, [Zn3ALU0123],
                      100, [100], 100>;

// Fence instructions.
defm : Zn3WriteResInt<WriteFence, [Zn3ALU0123],
                      1, [100], 1>;
1426
// LFENCE override: one uop, latency 1, Zn3LSU reserved for 30 cycles.
def Zn3WriteLFENCE : SchedWriteRes<[Zn3LSU]> {
  let NumMicroOps = 1;
  let ResourceCycles = [30];
  let Latency = 1;
}
def : InstRW<[Zn3WriteLFENCE],
             (instrs LFENCE)>;
1433
// SFENCE override: one uop, latency 1, Zn3LSU reserved for 1 cycle.
def Zn3WriteSFENCE : SchedWriteRes<[Zn3LSU]> {
  let NumMicroOps = 1;
  let ResourceCycles = [1];
  let Latency = 1;
}
def : InstRW<[Zn3WriteSFENCE],
             (instrs SFENCE)>;
1440
// Nop, not very useful except it provides a model for nops!
// FIXME: latency not from llvm-exegesis
defm : Zn3WriteResInt<WriteNop, [Zn3ALU0123],
                      0, [1], 1>;
1443
1444
1445///////////////////////////////////////////////////////////////////////////////
1446// Zero Cycle Move
1447///////////////////////////////////////////////////////////////////////////////
1448
// A write that uses no processor resources and has zero latency, yet still
// counts as one uop. Applied directly to the listed GPR register moves, and
// reused by the zero-idiom variants in the dependency-breaking section.
def Zn3WriteZeroLatency : SchedWriteRes<[]> {
  let NumMicroOps = 1;
  let ResourceCycles = [];
  let Latency = 0;
}
def : InstRW<[Zn3WriteZeroLatency],
             (instrs MOV32rr, MOV32rr_REV,
                     MOV64rr, MOV64rr_REV,
                     MOVSX32rr32)>;
1457
// Register-register XCHG (32- and 64-bit): no processor resources and zero
// latency, but counted as two uops.
def Zn3WriteSwapRenameable : SchedWriteRes<[]> {
  let NumMicroOps = 2;
  let ResourceCycles = [];
  let Latency = 0;
}
def : InstRW<[Zn3WriteSwapRenameable],
             (instrs XCHG32rr, XCHG32ar,
                     XCHG64rr, XCHG64ar)>;
1465
// Compare+Exchange - TODO RMW support.
defm : Zn3WriteResInt<WriteXCHG, [Zn3ALU0123],
                      0, [8], 2>;

// Empty sched class
defm : Zn3WriteResXMM<WriteFMove, [Zn3FPVMisc0123],
                      1, [1], 1>;
// The X/Y fp moves use no resources and have zero latency.
defm : Zn3WriteResXMM<WriteFMoveX, [],
                      0, [], 1>;
defm : Zn3WriteResYMM<WriteFMoveY, [],
                      0, [], 1>;

// MMX
defm : Zn3WriteResXMM<WriteVecMove, [Zn3FPFMisc0123],
                      1, [1], 1>;
// The X/Y vector moves use no resources and have zero latency.
defm : Zn3WriteResXMM<WriteVecMoveX, [],
                      0, [], 1>;
defm : Zn3WriteResYMM<WriteVecMoveY, [],
                      0, [], 1>;
1475
// Register-to-register moves registered with IsOptimizableRegisterMove.
// They form a single equivalence class guarded by TruePred.
def : IsOptimizableRegisterMove<[
  InstructionEquivalenceClass<[
    // GPR variants.
    MOV32rr, MOV32rr_REV, MOV64rr, MOV64rr_REV,
    MOVSX32rr32,
    XCHG32rr, XCHG32ar, XCHG64rr, XCHG64ar,

    // MMX variants.
    // MMX moves are *NOT* eliminated.

    // SSE variants.
    MOVAPSrr, MOVAPSrr_REV, MOVUPSrr, MOVUPSrr_REV,
    MOVAPDrr, MOVAPDrr_REV, MOVUPDrr, MOVUPDrr_REV,
    MOVDQArr, MOVDQArr_REV, MOVDQUrr, MOVDQUrr_REV,

    // AVX variants.
    VMOVAPSrr, VMOVAPSrr_REV, VMOVUPSrr, VMOVUPSrr_REV,
    VMOVAPDrr, VMOVAPDrr_REV, VMOVUPDrr, VMOVUPDrr_REV,
    VMOVDQArr, VMOVDQArr_REV, VMOVDQUrr, VMOVDQUrr_REV,

    // AVX YMM variants.
    VMOVAPSYrr, VMOVAPSYrr_REV, VMOVUPSYrr, VMOVUPSYrr_REV,
    VMOVAPDYrr, VMOVAPDYrr_REV, VMOVUPDYrr, VMOVUPDYrr_REV,
    VMOVDQAYrr, VMOVDQAYrr_REV, VMOVDQUYrr, VMOVDQUYrr_REV
  ], TruePred>
]>;
1513
1514///////////////////////////////////////////////////////////////////////////////
1515// Dependency breaking instructions.
1516///////////////////////////////////////////////////////////////////////////////
1517
// GPR XOR/SUB (32/64-bit, register-register): resolves to the zero-latency
// write when ZeroIdiomPredicate holds, and to WriteALU otherwise.
def Zn3WriteZeroIdiom : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteALU]>
]>;
def : InstRW<[Zn3WriteZeroIdiom],
             (instrs XOR32rr, XOR32rr_REV, XOR64rr, XOR64rr_REV,
                     SUB32rr, SUB32rr_REV, SUB64rr, SUB64rr_REV)>;
1526
// Register-register CMP (8/16/32/64-bit): resolves to the zero-latency write
// when CheckSameRegOperand<0, 1> holds, and to WriteALU otherwise.
def Zn3WriteZeroIdiomEFLAGS : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<CheckSameRegOperand<0, 1>>,
           [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteALU]>
]>;
def : InstRW<[Zn3WriteZeroIdiomEFLAGS],
             (instrs CMP8rr,  CMP8rr_REV,  CMP16rr, CMP16rr_REV,
                     CMP32rr, CMP32rr_REV, CMP64rr, CMP64rr_REV)>;
1535
// AVX XMM fp logic (VXORPS/VXORPD/VANDNPS/VANDNPD): resolves to the
// zero-latency write when ZeroIdiomPredicate holds, and to WriteFLogic
// otherwise.
def Zn3WriteFZeroIdiom : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteFLogic]>
]>;
// NOTE: XORPSrr, XORPDrr are not zero-cycle!
def : InstRW<[Zn3WriteFZeroIdiom],
             (instrs VXORPSrr, VXORPDrr,
                     VANDNPSrr, VANDNPDrr)>;
1543
// AVX YMM fp logic (VXORPS/VXORPD/VANDNPS/VANDNPD): resolves to the
// zero-latency write when ZeroIdiomPredicate holds, and to WriteFLogicY
// otherwise.
def Zn3WriteFZeroIdiomY : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteFLogicY]>
]>;
def : InstRW<[Zn3WriteFZeroIdiomY],
             (instrs VXORPSYrr, VXORPDYrr,
                     VANDNPSYrr, VANDNPDYrr)>;
1550
// AVX XMM integer logic (VPXOR/VPANDN): resolves to the zero-latency write
// when ZeroIdiomPredicate holds, and to WriteVecLogicX otherwise.
def Zn3WriteVZeroIdiomLogicX : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteVecLogicX]>
]>;
// NOTE: PXORrr,PANDNrr are not zero-cycle!
def : InstRW<[Zn3WriteVZeroIdiomLogicX],
             (instrs VPXORrr, VPANDNrr)>;
1557
// AVX YMM integer logic (VPXOR/VPANDN): resolves to the zero-latency write
// when ZeroIdiomPredicate holds, and to WriteVecLogicY otherwise.
def Zn3WriteVZeroIdiomLogicY : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteVecLogicY]>
]>;
def : InstRW<[Zn3WriteVZeroIdiomLogicY],
             (instrs VPXORYrr, VPANDNYrr)>;
1563
// AVX XMM integer subtract / compare-greater-than: resolves to the
// zero-latency write when ZeroIdiomPredicate holds, and to WriteVecALUX
// otherwise.
def Zn3WriteVZeroIdiomALUX : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteVecALUX]>
]>;
// NOTE: PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
//       PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr are not zero-cycle!
def : InstRW<[Zn3WriteVZeroIdiomALUX], (instrs
    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr)>;
1573
// AVX YMM integer subtract / compare-greater-than: resolves to the
// zero-latency write when ZeroIdiomPredicate holds, and to WriteVecALUY
// otherwise.
def Zn3WriteVZeroIdiomALUY : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteVecALUY]>
]>;
def : InstRW<[Zn3WriteVZeroIdiomALUY], (instrs
    VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
    VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr)>;
1581
// Instructions registered with IsZeroIdiomFunction. Every class below is
// guarded by ZeroIdiomPredicate. The legacy SSE forms are listed here even
// though the InstRW overrides above cover only the GPR and AVX forms.
def : IsZeroIdiomFunction<[
  // GPR Zero-idioms.
  DepBreakingClass<[
    XOR32rr, XOR32rr_REV, XOR64rr, XOR64rr_REV,
    SUB32rr, SUB32rr_REV, SUB64rr, SUB64rr_REV
  ], ZeroIdiomPredicate>,

  // SSE XMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr,

    // int variants.
    PXORrr, PANDNrr,
    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
    PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
    PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr
  ], ZeroIdiomPredicate>,

  // AVX XMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr,

    // int variants.
    VPXORrr, VPANDNrr,
    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
    VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr,
    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr
  ], ZeroIdiomPredicate>,

  // AVX YMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr,

    // int variants.
    VPXORYrr, VPANDNYrr,
    VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
    VPSUBSBYrr, VPSUBSWYrr, VPSUBUSBYrr, VPSUBUSWYrr,
    VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr
  ], ZeroIdiomPredicate>
]>;
1634
// Instructions registered with IsDepBreakingFunction. The CMP class is
// guarded by CheckSameRegOperand<0, 1>; every other class is guarded by
// ZeroIdiomPredicate.
def : IsDepBreakingFunction<[
  // GPR
  DepBreakingClass<[
    SBB32rr, SBB32rr_REV, SBB64rr, SBB64rr_REV
  ], ZeroIdiomPredicate>,
  DepBreakingClass<[
    CMP8rr,  CMP8rr_REV,  CMP16rr, CMP16rr_REV,
    CMP32rr, CMP32rr_REV, CMP64rr, CMP64rr_REV
  ], CheckSameRegOperand<0, 1>>,

  // MMX
  DepBreakingClass<[MMX_PCMPEQBirr, MMX_PCMPEQWirr, MMX_PCMPEQDirr],
                   ZeroIdiomPredicate>,

  // SSE
  DepBreakingClass<[PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr],
                   ZeroIdiomPredicate>,

  // AVX XMM
  DepBreakingClass<[VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr],
                   ZeroIdiomPredicate>,

  // AVX YMM
  DepBreakingClass<[VPCMPEQBYrr, VPCMPEQWYrr, VPCMPEQDYrr, VPCMPEQQYrr],
                   ZeroIdiomPredicate>
]>;
1664
1665} // SchedModel
1666