1//=- X86ScheduleZnver4.td - X86 Znver4 Scheduling ------------*- tablegen -*-=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the machine model for Znver4 to support instruction
10// scheduling and other instruction cost heuristics.
11// Based on:
12//  * AMD Software Optimization Guide for AMD Family 19h Processors.
13//    https://www.amd.com/system/files/TechDocs/56665.zip
14//===----------------------------------------------------------------------===//
15
def Znver4Model : SchedMachineModel {
  // AMD SOG 19h, 2.9.6 Dispatch
  // The processor may dispatch up to 6 macro ops per cycle
  // into the execution engine.
  let IssueWidth = 6;
  // AMD SOG 19h, 2.10.3
  // The retire control unit (RCU) tracks the completion status of all
  // outstanding operations (integer, load/store, and floating-point) and is
  // the final arbiter for exception processing and recovery.
  // The unit can receive up to 6 macro ops dispatched per cycle and track up
  // to 320 macro ops in-flight in non-SMT mode or 160 per thread in SMT mode.
  let MicroOpBufferSize = 320;
  // AMD SOG 19h, 2.9.1 Op Cache
  // The op cache is organized as an associative cache with 64 sets and 8 ways.
  // At each set-way intersection is an entry containing up to 8 macro ops.
  // The maximum capacity of the op cache is 4K ops.
  // Agner, 22.5 µop cache
  // The size of the µop cache is big enough for holding most critical loops.
  // FIXME: PR50584: MachineScheduler/PostRAScheduler have quadratic complexity,
  //        with large values here the compilation of certain loops
  //        ends up taking way too long.
  // Ideally for znver4, we should have 6.75K. However we don't add that
  // considering the impact on compile time and prefer using default values
  // instead.
  // Retaining minimal value to influence unrolling as we did for znver3.
  let LoopMicroOpBufferSize = 512;
  // AMD SOG 19h, 2.6.2 L1 Data Cache
  // The L1 data cache has a 4- or 5- cycle integer load-to-use latency.
  // AMD SOG 19h, 2.12 L1 Data Cache
  // The AGU and LS pipelines are optimized for simple address generation modes.
  // <...> and can achieve 4-cycle load-to-use integer load latency.
  let LoadLatency = 4;
  // AMD SOG 19h, 2.12 L1 Data Cache
  // The AGU and LS pipelines are optimized for simple address generation modes.
  // <...> and can achieve <...> 7-cycle load-to-use FP load latency.
  // Declared with `int` (a new field of this record, not a `let` override);
  // read elsewhere in this file as Znver4Model.VecLoadLatency.
  int VecLoadLatency = 7;
  // Latency of a simple store operation.
  // Also a new `int` field; read elsewhere as Znver4Model.StoreLatency.
  int StoreLatency = 1;
  // FIXME: no source is cited for this value.
  let HighLatency = 25; // FIXME: any better choice?
  // AMD SOG 19h, 2.8 Optimizing Branching
  // The branch misprediction penalty is in the range from 11 to 18 cycles,
  // <...>. The common case penalty is 13 cycles.
  let MispredictPenalty = 13;

  let PostRAScheduler = 1; // Enable Post RegAlloc Scheduler pass.

  // NOTE(review): presumably asserts that every instruction has scheduling
  // info in this model -- confirm against SchedMachineModel.
  let CompleteModel = 1;
}
65
66let SchedModel = Znver4Model in {
67
68
69//===----------------------------------------------------------------------===//
70// RCU
71//===----------------------------------------------------------------------===//
72
73// AMD SOG 19h, 2.10.3 Retire Control Unit
// The unit can receive up to 6 macro ops dispatched per cycle and track up to
// 320 macro ops in-flight in non-SMT mode or 160 per thread in SMT mode. <...>
76// The retire unit handles in-order commit of up to nine macro ops per cycle.
77def Zn4RCU : RetireControlUnit<Znver4Model.MicroOpBufferSize, 9>;
78
79//===----------------------------------------------------------------------===//
80// Integer Execution Unit
81//
82
83// AMD SOG 19h, 2.4 Superscalar Organization
84// The processor uses four decoupled independent integer scheduler queues,
85// each one servicing one ALU pipeline and one or two other pipelines
86
87//
88// Execution pipes
89//===----------------------------------------------------------------------===//
90
91// AMD SOG 19h, 2.10.2 Execution Units
92// The processor contains 4 general purpose integer execution pipes.
93// Each pipe has an ALU capable of general purpose integer operations.
94def Zn4ALU0 : ProcResource<1>;
95def Zn4ALU1 : ProcResource<1>;
96def Zn4ALU2 : ProcResource<1>;
97def Zn4ALU3 : ProcResource<1>;
98
99// AMD SOG 19h, 2.10.2 Execution Units
100// There is also a separate branch execution unit.
101def Zn4BRU1 : ProcResource<1>;
102
103// AMD SOG 19h, 2.10.2 Execution Units
104// There are three Address Generation Units (AGUs) for all load and store
105// address generation. There are also 3 store data movement units
106// associated with the same schedulers as the AGUs.
107def Zn4AGU0 : ProcResource<1>;
108def Zn4AGU1 : ProcResource<1>;
109def Zn4AGU2 : ProcResource<1>;
110
111//
112// Execution Units
113//===----------------------------------------------------------------------===//
114
115// AMD SOG 19h, 2.10.2 Execution Units
116// ALU0 additionally has divide <...> execution capability.
117defvar Zn4Divider = Zn4ALU0;
118
119// AMD SOG 19h, 2.10.2 Execution Units
120// ALU0 additionally has <...> branch execution capability.
121defvar Zn4BRU0 = Zn4ALU0;
122
123// Integer Multiplication issued on ALU1.
124defvar Zn4Multiplier = Zn4ALU1;
125
126// Execution pipeline grouping
127//===----------------------------------------------------------------------===//
128
129// General ALU operations
130def Zn4ALU0123 : ProcResGroup<[Zn4ALU0, Zn4ALU1, Zn4ALU2, Zn4ALU3]>;
131
132// General AGU operations
133def Zn4AGU012 : ProcResGroup<[Zn4AGU0, Zn4AGU1, Zn4AGU2]>;
134
135// Control flow: jumps, calls
136def Zn4BRU01 : ProcResGroup<[Zn4BRU0, Zn4BRU1]>;
137
138// Everything that isn't control flow, but still needs to access CC register,
139// namely: conditional moves, SETcc.
140def Zn4ALU03 : ProcResGroup<[Zn4ALU0, Zn4ALU3]>;
141
142// Zn4ALU1 handles complex bit twiddling: CRC/PDEP/PEXT
143
144// Simple bit twiddling: bit test, shift/rotate, bit extraction
145def Zn4ALU12 : ProcResGroup<[Zn4ALU1, Zn4ALU2]>;
146
147
148//
149// Scheduling
150//===----------------------------------------------------------------------===//
151
152// AMD SOG 19h, 2.10.3 Retire Control Unit
153// The integer physical register file (PRF) consists of 224 registers.
// NOTE(review): the two lists after the register classes are per-class
// (GR64, CCR); presumably register cost and allow-move-elimination flags --
// confirm against RegisterFile in TargetSchedule.td.
def Zn4IntegerPRF : RegisterFile<224, [GR64, CCR], [1, 1], [1, 0],
                              6,  // Max moves that can be eliminated per cycle.
                              0>; // 0: move elimination is presumably NOT
                                  // restricted to zero regs (TODO confirm).
157
158// anandtech, The integer scheduler has a 4*24 entry macro op capacity.
159// AMD SOG 19h, 2.10.1 Schedulers
160// The schedulers can receive up to six macro ops per cycle, with a limit of
161// two per scheduler. Each scheduler can issue one micro op per cycle into
162// each of its associated pipelines
163def Zn4Int : ProcResGroup<[Zn4ALU0, Zn4AGU0, Zn4BRU0, // scheduler 0
164                           Zn4ALU1, Zn4AGU1,          // scheduler 1
165                           Zn4ALU2, Zn4AGU2,          // scheduler 2
166                           Zn4ALU3,          Zn4BRU1  // scheduler 3
167                          ]> {
168  let BufferSize = !mul(4, 24);
169}
170
171
172//===----------------------------------------------------------------------===//
173// Floating-Point Unit
174//
175
176// AMD SOG 19h, 2.4 Superscalar Organization
177// The processor uses <...> two decoupled independent floating point schedulers
178// each servicing two FP pipelines and one store or FP-to-integer pipeline.
179
180//
181// Execution pipes
182//===----------------------------------------------------------------------===//
183
184// AMD SOG 19h, 2.10.1 Schedulers
185// <...>, and six FPU pipes.
186// Agner, 22.10 Floating point execution pipes
187// There are six floating point/vector execution pipes,
188def Zn4FP0  : ProcResource<1>;
189def Zn4FP1  : ProcResource<1>;
190def Zn4FP2  : ProcResource<1>;
191def Zn4FP3  : ProcResource<1>;
192def Zn4FP45 : ProcResource<2>;
193
194//
195// Execution Units
196//===----------------------------------------------------------------------===//
197// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
198
199// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
200defvar Zn4FPFMul0 = Zn4FP0;
201defvar Zn4FPFMul1 = Zn4FP1;
202
203// (v)FADD*
204defvar Zn4FPFAdd0 = Zn4FP2;
205defvar Zn4FPFAdd1 = Zn4FP3;
206
207// All convert operations except pack/unpack
208defvar Zn4FPFCvt0 = Zn4FP2;
209defvar Zn4FPFCvt1 = Zn4FP3;
210
211// All Divide and Square Root except Reciprocal Approximation
212// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
213// FDIV unit can support 2 simultaneous operations in flight
214// even though it occupies a single pipe.
215// FIXME: BufferSize=2 ?
216defvar Zn4FPFDiv = Zn4FP1;
217
218// Moves and Logical operations on Floating Point Data Types
219defvar Zn4FPFMisc0 = Zn4FP0;
220defvar Zn4FPFMisc1 = Zn4FP1;
221defvar Zn4FPFMisc2 = Zn4FP2;
222defvar Zn4FPFMisc3 = Zn4FP3;
223
224// Integer Adds, Subtracts, and Compares
225// Some complex VADD operations are not available in all pipes.
226defvar Zn4FPVAdd0 = Zn4FP0;
227defvar Zn4FPVAdd1 = Zn4FP1;
228defvar Zn4FPVAdd2 = Zn4FP2;
229defvar Zn4FPVAdd3 = Zn4FP3;
230
231// Integer Multiplies, SAD, Blendvb
232defvar Zn4FPVMul0 = Zn4FP0;
233defvar Zn4FPVMul1 = Zn4FP3;
234
235// Data Shuffles, Packs, Unpacks, Permute
236// Some complex shuffle operations are only available in pipe1.
237defvar Zn4FPVShuf = Zn4FP1;
238defvar Zn4FPVShufAux = Zn4FP2;
239
240// Bit Shift Left/Right operations
241defvar Zn4FPVShift0 = Zn4FP1;
242defvar Zn4FPVShift1 = Zn4FP2;
243
244// Moves and Logical operations on Packed Integer Data Types
245defvar Zn4FPVMisc0 = Zn4FP0;
246defvar Zn4FPVMisc1 = Zn4FP1;
247defvar Zn4FPVMisc2 = Zn4FP2;
248defvar Zn4FPVMisc3 = Zn4FP3;
249
250// *AES*
251defvar Zn4FPAES0 = Zn4FP0;
252defvar Zn4FPAES1 = Zn4FP1;
253
254// *CLM*
255defvar Zn4FPCLM0 = Zn4FP0;
256defvar Zn4FPCLM1 = Zn4FP1;
257
258// Execution pipeline grouping
259//===----------------------------------------------------------------------===//
260
// AMD SOG 19h, 2.11 Floating-Point Unit
// Stores and floating point to general purpose register transfer
// have 2 dedicated pipelines (pipe 5 and 6), modeled by Zn4FP45; the group
// below covers only the four general FP/vector pipes (Zn4FP0..Zn4FP3).
264def Zn4FPU0123 : ProcResGroup<[Zn4FP0, Zn4FP1, Zn4FP2, Zn4FP3]>;
265
266// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
267def Zn4FPFMul01 : ProcResGroup<[Zn4FPFMul0, Zn4FPFMul1]>;
268
269// (v)FADD*
270// Some complex VADD operations are not available in all pipes.
271def Zn4FPFAdd01 : ProcResGroup<[Zn4FPFAdd0, Zn4FPFAdd1]>;
272
273// All convert operations except pack/unpack
274def Zn4FPFCvt01 : ProcResGroup<[Zn4FPFCvt0, Zn4FPFCvt1]>;
275
276// All Divide and Square Root except Reciprocal Approximation
277// def Zn4FPFDiv : ProcResGroup<[Zn4FPFDiv]>;
278
279// Moves and Logical operations on Floating Point Data Types
280def Zn4FPFMisc0123 : ProcResGroup<[Zn4FPFMisc0, Zn4FPFMisc1, Zn4FPFMisc2, Zn4FPFMisc3]>;
281
282// FIXUP and RANGE use FP01 pipelines
283def Zn4FPFMisc01 : ProcResGroup<[Zn4FPFMisc0, Zn4FPFMisc1]>;
284def Zn4FPFMisc12 : ProcResGroup<[Zn4FPFMisc1, Zn4FPFMisc2]>;
285// SCALE instructions use FP23 pipelines
286def Zn4FPFMisc23 : ProcResGroup<[Zn4FPFMisc2, Zn4FPFMisc3]>;
287def Zn4FPFMisc123 : ProcResGroup<[Zn4FPFMisc1,Zn4FPFMisc2, Zn4FPFMisc3]>;
288
289// Loads, Stores and Move to General Register (EX) Operations
290// AMD SOG 19h, 2.11 Floating-Point Unit
291// Stores and floating point to general purpose register transfer
292// have 2 dedicated pipelines (pipe 5 and 6).
293defvar Zn4FPLd01 = Zn4FP45;
294
295// AMD SOG 19h, 2.11 Floating-Point Unit
296// Note that FP stores are supported on two pipelines,
297// but throughput is limited to one per cycle.
298let Super = Zn4FP45 in
299def Zn4FPSt : ProcResource<1>;
300
301// Integer Adds, Subtracts, and Compares
302// Some complex VADD operations are not available in all pipes.
303def Zn4FPVAdd0123 : ProcResGroup<[Zn4FPVAdd0, Zn4FPVAdd1, Zn4FPVAdd2, Zn4FPVAdd3]>;
304
305def Zn4FPVAdd01: ProcResGroup<[Zn4FPVAdd0, Zn4FPVAdd1]>;
306def Zn4FPVAdd12: ProcResGroup<[Zn4FPVAdd1, Zn4FPVAdd2]>;
307
308// AVX512 Opmask pipelines
309def Zn4FPOpMask01: ProcResGroup<[Zn4FP2, Zn4FP3]>;
310def Zn4FPOpMask4: ProcResGroup<[Zn4FP45]>;
311
312// Integer Multiplies, SAD, Blendvb
313def Zn4FPVMul01 : ProcResGroup<[Zn4FPVMul0, Zn4FPVMul1]>;
314
315// Data Shuffles, Packs, Unpacks, Permute
316// Some complex shuffle operations are only available in pipe1.
317def Zn4FPVShuf01 : ProcResGroup<[Zn4FPVShuf, Zn4FPVShufAux]>;
318
319// Bit Shift Left/Right operations
320def Zn4FPVShift01 : ProcResGroup<[Zn4FPVShift0, Zn4FPVShift1]>;
321
322// Moves and Logical operations on Packed Integer Data Types
323def Zn4FPVMisc0123 : ProcResGroup<[Zn4FPVMisc0, Zn4FPVMisc1, Zn4FPVMisc2, Zn4FPVMisc3]>;
324
325// *AES*
326def Zn4FPAES01 : ProcResGroup<[Zn4FPAES0, Zn4FPAES1]>;
327
328// *CLM*
329def Zn4FPCLM01 : ProcResGroup<[Zn4FPCLM0, Zn4FPCLM1]>;
330
331
332//
333// Scheduling
334//===----------------------------------------------------------------------===//
335
336// Agner, 21.8 Register renaming and out-of-order schedulers
337// The floating point register file has 192 vector registers
338// of 512b each in zen4.
// Four register classes share this file, each at a cost of one physical
// register. The move-elimination list is spelled out with one entry per
// class; VR512 is given an explicit 0.
// NOTE(review): the original list had only three entries; the 0 for VR512
// assumes the backend treats a missing entry as "not allowed" -- confirm.
def Zn4FpPRF : RegisterFile<192, [VR64, VR128, VR256, VR512], [1, 1, 1, 1],
                            [0, 1, 1, 0],
                            6,  // Max moves that can be eliminated per cycle.
                            0>; // 0: move elimination is presumably NOT
                                // restricted to zero regs (TODO confirm).
342
343// AMD SOG 19h, 2.11 Floating-Point Unit
344// The floating-point scheduler has a 2*32 entry macro op capacity.
345// AMD SOG 19h, 2.11 Floating-Point Unit
346// <...> the scheduler can issue 1 micro op per cycle for each pipe.
347// FIXME: those are two separate schedulers, not a single big one.
348def Zn4FP : ProcResGroup<[Zn4FP0, Zn4FP2,          /*Zn4FP4,*/ // scheduler 0
349                          Zn4FP1, Zn4FP3, Zn4FP45 /*Zn4FP5*/  // scheduler 1
350                         ]> {
351  let BufferSize = !mul(2, 32);
352}
353
354// AMD SOG 19h, 2.11 Floating-Point Unit
355// Macro ops can be dispatched to the 64 entry Non Scheduling Queue (NSQ)
356// even if floating-point scheduler is full.
357// FIXME: how to model this properly?
358
359
360//===----------------------------------------------------------------------===//
361// Load-Store Unit
362//
363
364// AMD SOG 19h, 2.12 Load-Store Unit
365// The LS unit contains three largely independent pipe-lines
366// enabling the execution of three 256-bit memory operations per cycle.
367def Zn4LSU : ProcResource<3>;
368
369// AMD SOG 19h, 2.12 Load-Store Unit
370// All three memory operations can be loads.
371let Super = Zn4LSU in
372def Zn4Load : ProcResource<3> {
373  // AMD SOG 19h, 2.12 Load-Store Unit
374  // The LS unit can process up to 72 out-of-order loads.
375  let BufferSize = 72;
376}
377
378def Zn4LoadQueue : LoadQueue<Zn4Load>;
379
380// AMD SOG 19h, 2.12 Load-Store Unit
381// A maximum of two of the memory operations can be stores.
382let Super = Zn4LSU in
383def Zn4Store : ProcResource<2> {
384  // AMD SOG 19h, 2.12 Load-Store Unit
385  // The LS unit utilizes a 64-entry store queue (STQ).
386  let BufferSize = 64;
387}
388
389def Zn4StoreQueue : StoreQueue<Zn4Store>;
390
391//===----------------------------------------------------------------------===//
392// Basic helper classes.
393//===----------------------------------------------------------------------===//
394
395// Many SchedWrites are defined in pairs with and without a folded load.
396// Instructions with folded loads are usually micro-fused, so they only appear
397// as two micro-ops when dispatched by the schedulers.
398// This multiclass defines the resource usage for variants with and without
399// folded loads.
400
// Emits a single anonymous WriteRes binding SchedRW to ExePorts.
//   Lat  -> Latency          (default 1)
//   Res  -> ReleaseAtCycles  (default [], passed through unchanged)
//   UOps -> NumMicroOps      (default 1)
multiclass __Zn4WriteRes<SchedWrite SchedRW, list<ProcResourceKind> ExePorts,
                         int Lat = 1, list<int> Res = [], int UOps = 1> {
  def : WriteRes<SchedRW, ExePorts> {
    let Latency = Lat;
    let ReleaseAtCycles = Res;
    let NumMicroOps = UOps;
  }
}
409
// Emits two WriteRes records for a foldable SchedWrite:
//   * SchedRW itself, on ExePorts with Lat / Res / UOps as given;
//   * SchedRW.Folded, which additionally occupies AGU and Zn4Load, adds
//     LoadLat to the latency and LoadUOps to the micro-op count.
// The folded ReleaseAtCycles list is left empty only when Res is empty and
// LoadRes is 1; otherwise it is [1, LoadRes] followed by Res (or by one
// cycle per execution port when Res is empty).
multiclass __Zn4WriteResPair<X86FoldableSchedWrite SchedRW,
                             list<ProcResourceKind> ExePorts, int Lat,
                             list<int> Res, int UOps, int LoadLat, int LoadUOps,
                             ProcResourceKind AGU, int LoadRes> {
  // Register-only variant.
  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;

  // Resources of the folded variant: address generation and load pipe first,
  // then the execution ports of the register variant.
  defvar FoldedPorts = !listconcat([AGU, Zn4Load], ExePorts);

  // Cycles for the execution ports: caller-supplied, or 1 per port.
  defvar ExeCycles = !if(!empty(Res), !listsplat(1, !size(ExePorts)), Res);

  // Folded-load variant.
  defm : __Zn4WriteRes<SchedRW.Folded, FoldedPorts, !add(Lat, LoadLat),
                       !if(!and(!empty(Res), !eq(LoadRes, 1)),
                         [],
                         !listconcat([1, LoadRes], ExeCycles)),
                       !add(UOps, LoadUOps)>;
}
427
428// For classes without folded loads.
// The four multiclasses below have identical bodies: each forwards all of its
// arguments unchanged to __Zn4WriteRes. They differ only in name
// (Int / XMM / YMM / ZMM).

// Integer variant.
multiclass Zn4WriteResInt<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

// XMM variant.
multiclass Zn4WriteResXMM<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

// YMM variant.
multiclass Zn4WriteResYMM<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

// ZMM variant.
multiclass Zn4WriteResZMM<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}
452
453// For classes with folded loads.
// Integer pair: forwards to __Zn4WriteResPair with Znver4Model.LoadLatency as
// the load latency and Zn4AGU012 as the AGU resource.
multiclass Zn4WriteResIntPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           Znver4Model.LoadLatency,
                           LoadUOps, Zn4AGU012, LoadRes>;
}

// XMM pair: forwards to __Zn4WriteResPair with Znver4Model.VecLoadLatency as
// the load latency and Zn4FPLd01 in the AGU position.
multiclass Zn4WriteResXMMPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           Znver4Model.VecLoadLatency,
                           LoadUOps, Zn4FPLd01, LoadRes>;
}

// YMM pair: same parameters and forwarding as the XMM pair.
multiclass Zn4WriteResYMMPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           Znver4Model.VecLoadLatency,
                           LoadUOps, Zn4FPLd01, LoadRes>;
}

// ZMM pair: same forwarding as the XMM/YMM pairs, but note that UOps
// defaults to 2 here rather than 1.
multiclass Zn4WriteResZMMPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 2,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           Znver4Model.VecLoadLatency,
                           LoadUOps, Zn4FPLd01, LoadRes>;
}
489
490//===----------------------------------------------------------------------===//
491// Here be dragons.
492//===----------------------------------------------------------------------===//
493
494def : ReadAdvance<ReadAfterLd, Znver4Model.LoadLatency>;
495
496def : ReadAdvance<ReadAfterVecLd, Znver4Model.VecLoadLatency>;
497def : ReadAdvance<ReadAfterVecXLd, Znver4Model.VecLoadLatency>;
498def : ReadAdvance<ReadAfterVecYLd, Znver4Model.VecLoadLatency>;
499
500// AMD SOG 19h, 2.11 Floating-Point Unit
501// There is 1 cycle of added latency for a result to cross
502// from F to I or I to F domain.
503def : ReadAdvance<ReadInt2Fpu, -1>;
504
505// Instructions with both a load and a store folded are modeled as a folded
506// load + WriteRMW.
507defm : Zn4WriteResInt<WriteRMW, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 1], 0>;
508
509// Loads, stores, and moves, not folded with other operations.
510defm : Zn4WriteResInt<WriteLoad, [Zn4AGU012, Zn4Load], !add(Znver4Model.LoadLatency, 1), [1, 1], 1>;
511
// Model the effect of clobbering the read-write mask operand of the GATHER operation.
// Does not cost anything by itself, only has latency, matching that of the WriteLoad.
514defm : Zn4WriteResInt<WriteVecMaskedGatherWriteback, [], !add(Znver4Model.LoadLatency, 1), [], 0>;
515
// Load class for the instructions listed below: one micro-op, latency one
// cycle above Znver4Model.LoadLatency, holding Zn4AGU012 for 3 cycles and
// Zn4Load for 1.
def Zn4WriteMOVSlow : SchedWriteRes<[Zn4AGU012, Zn4Load]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [3, 1];
  let Latency = !add(Znver4Model.LoadLatency, 1);
}
def : InstRW<[Zn4WriteMOVSlow], (instrs
    MOV8rm, MOV8rm_NOREX, MOV16rm,
    MOVSX16rm16, MOVSX16rm32, MOVZX16rm16,
    MOVSX16rm8, MOVZX16rm8)>;
522
523defm : Zn4WriteResInt<WriteStore, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 2], 1>;
524defm : Zn4WriteResInt<WriteStoreNT, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 2], 1>;
525defm : Zn4WriteResInt<WriteMove, [Zn4ALU0123], 1, [4], 1>;
526
527// Treat misc copies as a move.
528def : InstRW<[WriteMove], (instrs COPY)>;
529
530def Zn4WriteMOVBE16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
531  let Latency = Znver4Model.LoadLatency;
532  let ReleaseAtCycles = [1, 1, 4];
533  let NumMicroOps = 1;
534}
535def : InstRW<[Zn4WriteMOVBE16rm], (instrs MOVBE16rm)>;
536
537def Zn4WriteMOVBEmr : SchedWriteRes<[Zn4ALU0123, Zn4AGU012, Zn4Store]> {
538  let Latency = Znver4Model.StoreLatency;
539  let ReleaseAtCycles = [4, 1, 1];
540  let NumMicroOps = 2;
541}
542def : InstRW<[Zn4WriteMOVBEmr], (instrs MOVBE16mr, MOVBE32mr, MOVBE64mr)>;
543
544// Arithmetic.
545defm : Zn4WriteResIntPair<WriteALU, [Zn4ALU0123], 1, [1], 1>; // Simple integer ALU op.
546
// ALU class for the accumulator-immediate forms listed below: single
// micro-op, 1-cycle latency, Zn4ALU0123 held for 4 cycles.
def Zn4WriteALUSlow : SchedWriteRes<[Zn4ALU0123]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [4];
  let Latency = 1;
}
def : InstRW<[Zn4WriteALUSlow], (instrs
    ADD8i8, ADD16i16, ADD32i32, ADD64i32,
    AND8i8, AND16i16, AND32i32, AND64i32,
    OR8i8,  OR16i16,  OR32i32,  OR64i32,
    SUB8i8, SUB16i16, SUB32i32, SUB64i32,
    XOR8i8, XOR16i16, XOR32i32, XOR64i32)>;
557
558def Zn4WriteMoveExtend : SchedWriteRes<[Zn4ALU0123]> {
559  let Latency = 1;
560  let ReleaseAtCycles = [4];
561  let NumMicroOps = 1;
562}
563def : InstRW<[Zn4WriteMoveExtend], (instrs MOVSX16rr16, MOVSX16rr32, MOVZX16rr16, MOVSX16rr8, MOVZX16rr8)>;
564
565def Zn4WriteMaterialize32bitImm: SchedWriteRes<[Zn4ALU0123]> {
566  let Latency = 1;
567  let ReleaseAtCycles = [2];
568  let NumMicroOps = 1;
569}
570def : InstRW<[Zn4WriteMaterialize32bitImm], (instrs MOV32ri, MOV32ri_alt, MOV64ri32)>;
571
572def Zn4WritePDEP_PEXT : SchedWriteRes<[Zn4ALU1]> {
573  let Latency = 3;
574  let ReleaseAtCycles = [1];
575  let NumMicroOps = 1;
576}
577def : InstRW<[Zn4WritePDEP_PEXT], (instrs PDEP32rr, PDEP64rr,
578                                          PEXT32rr, PEXT64rr)>;
579
580defm : Zn4WriteResIntPair<WriteADC, [Zn4ALU0123], 1, [4], 1>; // Integer ALU + flags op.
581
582def Zn4WriteADC8mr_SBB8mr : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123, Zn4Store]> {
583  let Latency = 1;
584  let ReleaseAtCycles = [1, 1, 7, 1];
585  let NumMicroOps = 1;
586}
587def : InstRW<[Zn4WriteADC8mr_SBB8mr], (instrs ADC8mr, SBB8mr)>;
588
589// This is for simple LEAs with one or two input operands.
590defm : Zn4WriteResInt<WriteLEA, [Zn4AGU012], 1, [1], 1>;     // LEA instructions can't fold loads.
591
592// This write is used for slow LEA instructions.
593def Zn4Write3OpsLEA : SchedWriteRes<[Zn4ALU0123]> {
594  let Latency = 2;
595  let ReleaseAtCycles = [1];
596  let NumMicroOps = 2;
597}
598
599// On Znver4, a slow LEA is either a 3Ops LEA (base, index, offset),
600// or an LEA with a `Scale` value different than 1.
601def Zn4SlowLEAPredicate : MCSchedPredicate<
602  CheckAny<[
603    // A 3-operand LEA (base, index, offset).
604    IsThreeOperandsLEAFn,
605    // An LEA with a "Scale" different than 1.
606    CheckAll<[
607      CheckIsImmOperand<2>,
608      CheckNot<CheckImmOperand<2, 1>>
609    ]>
610  ]>
611>;
612
613def Zn4WriteLEA : SchedWriteVariant<[
614    SchedVar<Zn4SlowLEAPredicate, [Zn4Write3OpsLEA]>,
615    SchedVar<NoSchedPred,         [WriteLEA]>
616]>;
617
618def : InstRW<[Zn4WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
619
620def Zn4SlowLEA16r : SchedWriteRes<[Zn4ALU0123]> {
621  let Latency = 2; // FIXME: not from llvm-exegesis
622  let ReleaseAtCycles = [4];
623  let NumMicroOps = 2;
624}
625
626def : InstRW<[Zn4SlowLEA16r], (instrs LEA16r)>;
627
628// Integer multiplication
// All register/folded pairs below execute on Zn4Multiplier. The trailing
// arguments are Lat, Res (ReleaseAtCycles) and UOps, in that order.
defm : Zn4WriteResIntPair<WriteIMul8, [Zn4Multiplier], 3, [3], 1>; // Integer 8-bit multiplication.
defm : Zn4WriteResIntPair<WriteIMul16, [Zn4Multiplier], 3, [3], 3, /*LoadUOps=*/1>; // Integer 16-bit multiplication.
defm : Zn4WriteResIntPair<WriteIMul16Imm, [Zn4Multiplier], 4, [4], 2>; // Integer 16-bit multiplication by immediate.
defm : Zn4WriteResIntPair<WriteIMul16Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 16-bit multiplication by register.
defm : Zn4WriteResIntPair<WriteIMul32, [Zn4Multiplier], 3, [3], 2>;    // Integer 32-bit multiplication.
defm : Zn4WriteResIntPair<WriteMULX32, [Zn4Multiplier], 3, [1], 2>;    // Integer 32-bit Unsigned Multiply Without Affecting Flags.
defm : Zn4WriteResIntPair<WriteIMul32Imm, [Zn4Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by immediate.
defm : Zn4WriteResIntPair<WriteIMul32Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by register.
defm : Zn4WriteResIntPair<WriteIMul64, [Zn4Multiplier], 3, [3], 2>;    // Integer 64-bit multiplication.
defm : Zn4WriteResIntPair<WriteMULX64, [Zn4Multiplier], 3, [1], 2>;    // Integer 64-bit Unsigned Multiply Without Affecting Flags.
defm : Zn4WriteResIntPair<WriteIMul64Imm, [Zn4Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by immediate.
defm : Zn4WriteResIntPair<WriteIMul64Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by register.
// The two high-part writes use no resources and no micro-ops; they only
// carry latency.
defm : Zn4WriteResInt<WriteIMulHLd, [], !add(4, Znver4Model.LoadLatency), [], 0>;  // Integer multiplication, high part; latency includes Znver4Model.LoadLatency.
defm : Zn4WriteResInt<WriteIMulH, [], 4, [], 0>;  // Integer multiplication, high part.
643
644defm : Zn4WriteResInt<WriteBSWAP32, [Zn4ALU0123], 1, [1], 1>; // Byte Order (Endianness) 32-bit Swap.
645defm : Zn4WriteResInt<WriteBSWAP64, [Zn4ALU0123], 1, [1], 1>; // Byte Order (Endianness) 64-bit Swap.
646
647defm : Zn4WriteResIntPair<WriteCMPXCHG, [Zn4ALU0123], 3, [12], 5>; // Compare and set, compare and swap.
648
649def Zn4WriteCMPXCHG8rr : SchedWriteRes<[Zn4ALU0123]> {
650  let Latency = 3;
651  let ReleaseAtCycles = [12];
652  let NumMicroOps = 3;
653}
654def : InstRW<[Zn4WriteCMPXCHG8rr], (instrs CMPXCHG8rr)>;
655
656defm : Zn4WriteResInt<WriteCMPXCHGRMW, [Zn4ALU0123], 3, [12], 6>;     // Compare and set, compare and swap.
657
658def Zn4WriteCMPXCHG8rm_LCMPXCHG8 : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
659  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteCMPXCHG8rr.Latency);
660  let ReleaseAtCycles = [1, 1, 12];
661  let NumMicroOps = !add(Zn4WriteCMPXCHG8rr.NumMicroOps, 2);
662}
663def : InstRW<[Zn4WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>;
664
665def Zn4WriteCMPXCHG8B : SchedWriteRes<[Zn4ALU0123]> {
666  let Latency = 3; // FIXME: not from llvm-exegesis
667  let ReleaseAtCycles = [24];
668  let NumMicroOps = 19;
669}
670def : InstRW<[Zn4WriteCMPXCHG8B], (instrs CMPXCHG8B)>;
671
672def Zn4WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn4ALU0123]> {
673  let Latency = 4; // FIXME: not from llvm-exegesis
674  let ReleaseAtCycles = [59];
675  let NumMicroOps = 28;
676}
677def : InstRW<[Zn4WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B, LCMPXCHG16B)>;
678
679def Zn4WriteWriteXCHGUnrenameable : SchedWriteRes<[Zn4ALU0123]> {
680  let Latency = 1;
681  let ReleaseAtCycles = [2];
682  let NumMicroOps = 2;
683}
684def : InstRW<[Zn4WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr, XCHG16ar)>;
685
686def Zn4WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
687  let Latency = !add(Znver4Model.LoadLatency, 3); // FIXME: not from llvm-exegesis
688  let ReleaseAtCycles = [1, 1, 2];
689  let NumMicroOps = 5;
690}
691def : InstRW<[Zn4WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>;
692
693def Zn4WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
694  let Latency = !add(Znver4Model.LoadLatency, 2); // FIXME: not from llvm-exegesis
695  let ReleaseAtCycles = [1, 1, 2];
696  let NumMicroOps = 2;
697}
698def : InstRW<[Zn4WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>;
699
700// Integer division.
701// FIXME: uops for 8-bit division measures as 2. for others it's a guess.
702// FIXME: latency for 8-bit division measures as 10. for others it's a guess.
703defm : Zn4WriteResIntPair<WriteDiv8, [Zn4Divider], 10, [10], 2>;
704defm : Zn4WriteResIntPair<WriteDiv16, [Zn4Divider], 11, [11], 2>;
705defm : Zn4WriteResIntPair<WriteDiv32, [Zn4Divider], 13, [13], 2>;
706defm : Zn4WriteResIntPair<WriteDiv64, [Zn4Divider], 17, [17], 2>;
707defm : Zn4WriteResIntPair<WriteIDiv8, [Zn4Divider], 10, [10], 2>;
708defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 11, [11], 2>;
709defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 13, [13], 2>;
710defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 17, [17], 2>;
711
712defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan forward.
713defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan reverse.
714
715defm : Zn4WriteResIntPair<WritePOPCNT, [Zn4ALU0123], 1, [1], 1>; // Bit population count.
716
717def Zn4WritePOPCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
718  let Latency = 1;
719  let ReleaseAtCycles = [4];
720  let NumMicroOps = 1;
721}
722def : InstRW<[Zn4WritePOPCNT16rr], (instrs POPCNT16rr)>;
723
724defm : Zn4WriteResIntPair<WriteLZCNT, [Zn4ALU0123], 1, [1], 1>; // Leading zero count.
725
726def Zn4WriteLZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
727  let Latency = 1;
728  let ReleaseAtCycles = [4];
729  let NumMicroOps = 1;
730}
731def : InstRW<[Zn4WriteLZCNT16rr], (instrs LZCNT16rr)>;
732
733defm : Zn4WriteResIntPair<WriteTZCNT, [Zn4ALU12], 2, [1], 2>; // Trailing zero count.
734
735def Zn4WriteTZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
736  let Latency = 2;
737  let ReleaseAtCycles = [4];
738  let NumMicroOps = 2;
739}
740def : InstRW<[Zn4WriteTZCNT16rr], (instrs TZCNT16rr)>;
741
// Conditional moves, SETcc and AH flag transfers. CMOV and SETcc are bound to
// Zn4ALU03; the SETcc store form additionally lists Zn4AGU012 and Zn4Store.
defm : Zn4WriteResIntPair<WriteCMOV, [Zn4ALU03], 1, [1], 1>; // Conditional move.
defm : Zn4WriteResInt<WriteFCMOV, [Zn4ALU0123], 7, [28], 7>; // FIXME: not from llvm-exegesis // X87 conditional move.
defm : Zn4WriteResInt<WriteSETCC, [Zn4ALU03], 1, [2], 1>; // Set register based on condition code.
defm : Zn4WriteResInt<WriteSETCCStore, [Zn4ALU03, Zn4AGU012, Zn4Store], 2, [2, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
defm : Zn4WriteResInt<WriteLAHFSAHF, [Zn4ALU3], 1, [1], 1>; // Load/Store flags in AH.
747
// Bit test and bit test-and-set. The register entries use Zn4ALU12 only; the
// *Ld entries add Zn4AGU012 and Zn4Load in front of it and derive their third
// argument from Znver4Model.LoadLatency. The RegLd variants carry a larger
// final argument than the ImmLd variants.
defm : Zn4WriteResInt<WriteBitTest, [Zn4ALU12], 1, [1], 1>; // Bit Test
defm : Zn4WriteResInt<WriteBitTestImmLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 1), [1, 1, 1], 2>;
defm : Zn4WriteResInt<WriteBitTestRegLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 1), [1, 1, 1], 7>;

defm : Zn4WriteResInt<WriteBitTestSet, [Zn4ALU12], 2, [2], 2>; // Bit Test + Set
defm : Zn4WriteResInt<WriteBitTestSetImmLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 1], 4>;
defm : Zn4WriteResInt<WriteBitTestSetRegLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 1], 9>;
755
// Integer shifts and rotates.
// Shift, shift-by-CL and rotate share one entry shape on Zn4ALU12. The
// RCL/RCR instructions receive dedicated InstRW entries further down.
defm : Zn4WriteResIntPair<WriteShift, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn4WriteResIntPair<WriteShiftCL, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn4WriteResIntPair<WriteRotate, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
760
// RCL/RCR by 1, register forms: a single micro-op with latency 1 that keeps
// Zn4ALU12 until cycle 2.
def Zn4WriteRotateR1 : SchedWriteRes<[Zn4ALU12]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ReleaseAtCycles = [2];
}
def : InstRW<[Zn4WriteRotateR1],
             (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
                     RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;
768
// RCL/RCR by 1, memory forms. Derived from Zn4WriteRotateR1: the model's
// LoadLatency is added to its latency and one micro-op to its micro-op count.
def Zn4WriteRotateM1 : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
  let NumMicroOps = !add(1, Zn4WriteRotateR1.NumMicroOps);
  let Latency = !add(Zn4WriteRotateR1.Latency, Znver4Model.LoadLatency);
  let ReleaseAtCycles = [1, 1, 2];
}
def : InstRW<[Zn4WriteRotateM1],
             (instrs RCL8m1, RCL16m1, RCL32m1, RCL64m1,
                     RCR8m1, RCR16m1, RCR32m1, RCR64m1)>;
776
// RCR by immediate, register forms: 7 micro-ops, latency 3, Zn4ALU12 released
// at cycle 6.
def Zn4WriteRotateRightRI : SchedWriteRes<[Zn4ALU12]> {
  let NumMicroOps = 7;
  let Latency = 3;
  let ReleaseAtCycles = [6];
}
def : InstRW<[Zn4WriteRotateRightRI],
             (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;
783
// RCR by immediate, memory forms. Derived from Zn4WriteRotateRightRI: the
// model's LoadLatency is added to its latency and 3 to its micro-op count.
def Zn4WriteRotateRightMI : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
  let NumMicroOps = !add(3, Zn4WriteRotateRightRI.NumMicroOps);
  let Latency = !add(Zn4WriteRotateRightRI.Latency, Znver4Model.LoadLatency);
  let ReleaseAtCycles = [1, 1, 8];
}
def : InstRW<[Zn4WriteRotateRightMI],
             (instrs RCR8mi, RCR16mi, RCR32mi, RCR64mi)>;
790
// RCL by immediate, register forms: 9 micro-ops, latency 4, Zn4ALU12 released
// at cycle 8.
def Zn4WriteRotateLeftRI : SchedWriteRes<[Zn4ALU12]> {
  let NumMicroOps = 9;
  let Latency = 4;
  let ReleaseAtCycles = [8];
}
def : InstRW<[Zn4WriteRotateLeftRI],
             (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;
797
// RCL by immediate, memory forms. Derived from Zn4WriteRotateLeftRI: the
// model's LoadLatency is added to its latency and 2 to its micro-op count.
def Zn4WriteRotateLeftMI : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
  let NumMicroOps = !add(2, Zn4WriteRotateLeftRI.NumMicroOps);
  let Latency = !add(Zn4WriteRotateLeftRI.Latency, Znver4Model.LoadLatency);
  let ReleaseAtCycles = [1, 1, 8];
}
def : InstRW<[Zn4WriteRotateLeftMI],
             (instrs RCL8mi, RCL16mi, RCL32mi, RCL64mi)>;
804
// Default entry for WriteRotateCL; the RCL/RCR ...CL instructions get their
// own InstRW entries.
defm : Zn4WriteResIntPair<WriteRotateCL, [Zn4ALU12], 1, [1], 1,
                          /*LoadUOps=*/1>;
806
// RCR by CL, register forms: 7 micro-ops, latency 3, Zn4ALU12 released at
// cycle 6.
def Zn4WriteRotateRightRCL : SchedWriteRes<[Zn4ALU12]> {
  let NumMicroOps = 7;
  let Latency = 3;
  let ReleaseAtCycles = [6];
}
def : InstRW<[Zn4WriteRotateRightRCL],
             (instrs RCR8rCL, RCR16rCL, RCR32rCL, RCR64rCL)>;
813
// RCR by CL, memory forms. Derived from Zn4WriteRotateRightRCL: the model's
// LoadLatency is added to its latency and 2 to its micro-op count.
def Zn4WriteRotateRightMCL : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
  let NumMicroOps = !add(2, Zn4WriteRotateRightRCL.NumMicroOps);
  let Latency = !add(Zn4WriteRotateRightRCL.Latency, Znver4Model.LoadLatency);
  let ReleaseAtCycles = [1, 1, 8];
}
def : InstRW<[Zn4WriteRotateRightMCL],
             (instrs RCR8mCL, RCR16mCL, RCR32mCL, RCR64mCL)>;
820
// RCL by CL, register forms: 9 micro-ops, latency 4, Zn4ALU12 released at
// cycle 8.
def Zn4WriteRotateLeftRCL : SchedWriteRes<[Zn4ALU12]> {
  let NumMicroOps = 9;
  let Latency = 4;
  let ReleaseAtCycles = [8];
}
def : InstRW<[Zn4WriteRotateLeftRCL],
             (instrs RCL8rCL, RCL16rCL, RCL32rCL, RCL64rCL)>;
827
// RCL by CL, memory forms. Derived from Zn4WriteRotateLeftRCL: the model's
// LoadLatency is added to its latency and 2 to its micro-op count.
def Zn4WriteRotateLeftMCL : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
  let NumMicroOps = !add(2, Zn4WriteRotateLeftRCL.NumMicroOps);
  let Latency = !add(Zn4WriteRotateLeftRCL.Latency, Znver4Model.LoadLatency);
  let ReleaseAtCycles = [1, 1, 8];
}
def : InstRW<[Zn4WriteRotateLeftMCL],
             (instrs RCL8mCL, RCL16mCL, RCL32mCL, RCL64mCL)>;
834
// Double shift instructions.
// Register forms (rri/rrcl) use Zn4ALU12 only; memory forms (mri/mrcl) put
// Zn4AGU012 and Zn4Load in front of it and derive their third argument from
// Znver4Model.LoadLatency.
defm : Zn4WriteResInt<WriteSHDrri, [Zn4ALU12], 2, [3], 4>;
defm : Zn4WriteResInt<WriteSHDrrcl, [Zn4ALU12], 2, [3], 5>;
defm : Zn4WriteResInt<WriteSHDmri, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 4], 6>;
defm : Zn4WriteResInt<WriteSHDmrcl, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 4], 6>;
840
// BMI1 BEXTR/BLS, BMI2 BZHI
// BEXTR and BZHI are bound to Zn4ALU12, BLS to Zn4ALU0123; the numeric
// arguments are the same for all three.
defm : Zn4WriteResIntPair<WriteBEXTR, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn4WriteResIntPair<WriteBLS, [Zn4ALU0123], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn4WriteResIntPair<WriteBZHI, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
845
// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely.
// NOTE(review): the third and fourth arguments are 0 and [0], presumably the
// latency and release-at-cycles values - confirm against Zn4WriteResInt.
defm : Zn4WriteResInt<WriteZero, [Zn4ALU0123], 0, [0], 1>;
849
// Branches don't produce values, so they have no latency, but they still
// consume resources. Indirect branches can fold loads.
// NOTE(review): the entry below still passes 1 as its third argument
// (presumably the latency); see the FIXME on that line.
defm : Zn4WriteResIntPair<WriteJump, [Zn4BRU01], 1, [1], 1>; // FIXME: not from llvm-exegesis
853
// Floating point. This covers both scalar and vector operations.
// WriteFLD0/WriteFLD1/WriteFLDC list Zn4FPLd01, Zn4Load and Zn4FP1 and build
// on Znver4Model.LoadLatency. The FP load entries after them list only
// Zn4FPLd01 and Zn4Load and use Znver4Model.VecLoadLatency + 1. The plain
// store entry uses Zn4FPSt and Zn4Store with Znver4Model.StoreLatency.
defm : Zn4WriteResInt<WriteFLD0, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 4), [1, 1, 1], 1>;
defm : Zn4WriteResInt<WriteFLD1, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 7), [1, 1, 1], 1>;
defm : Zn4WriteResInt<WriteFLDC, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 7), [1, 1, 1], 1>;
defm : Zn4WriteResXMM<WriteFLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteFLoadX, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResYMM<WriteFLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteFMaskedLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResYMM<WriteFMaskedLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteFStore, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
864
// Dedicated two-micro-op store entry for the MOVHPD/MOVHPS store forms and
// their VEX-prefixed counterparts.
def Zn4WriteWriteFStoreMMX : SchedWriteRes<[Zn4FPSt, Zn4Store]> {
  let NumMicroOps = 2;
  let Latency = 2; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1, 1];
}
def : InstRW<[Zn4WriteWriteFStoreMMX],
             (instrs MOVHPDmr, MOVHPSmr, VMOVHPDmr, VMOVHPSmr)>;
872
// Remaining FP stores, all on Zn4FPSt + Zn4Store with
// Znver4Model.StoreLatency. Regular and non-temporal (NT) stores share one
// entry shape.
defm : Zn4WriteResXMM<WriteFStoreX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResYMM<WriteFStoreY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResXMM<WriteFStoreNT, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResXMM<WriteFStoreNTX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResYMM<WriteFStoreNTY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;

// Masked FP stores: the first release-at-cycles element (for Zn4FPSt) and the
// final argument are much larger than for the plain stores above.
// NOTE(review): the final argument is presumably the micro-op count - confirm.
defm : Zn4WriteResXMM<WriteFMaskedStore32, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
defm : Zn4WriteResXMM<WriteFMaskedStore64, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [4, 1], 10>;
defm : Zn4WriteResYMM<WriteFMaskedStore32Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [12, 1], 42>;
defm : Zn4WriteResYMM<WriteFMaskedStore64Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
883
// Floating point add/sub.
defm : Zn4WriteResXMMPair<WriteFAdd, [Zn4FPFAdd01], 3, [1], 1>;
885
// x87 add/sub/reverse-sub/multiply with a 16- or 32-bit integer memory operand
// (the *_FI16m / *_FI32m forms): two micro-ops, Zn4FPU0123 released at
// cycle 24.
def Zn4WriteX87Arith : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let NumMicroOps = 2;
  let Latency = !add(Znver4Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1, 1, 24];
}
def : InstRW<[Zn4WriteX87Arith],
             (instrs ADD_FI16m, ADD_FI32m,
                     SUB_FI16m, SUB_FI32m,
                     SUBR_FI16m, SUBR_FI32m,
                     MUL_FI16m, MUL_FI32m)>;
895
// x87 divide / reverse divide with a 16- or 32-bit integer memory operand:
// same shape as Zn4WriteX87Arith, but Zn4FPU0123 is released at cycle 62.
def Zn4WriteX87Div : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let NumMicroOps = 2;
  let Latency = !add(Znver4Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1, 1, 62];
}
def : InstRW<[Zn4WriteX87Div],
             (instrs DIV_FI16m, DIV_FI32m,
                     DIVR_FI16m, DIVR_FI32m)>;
903
// FP add/sub and compare entries, one per register-width variant (scalar
// default, XMM, YMM, ZMM) for single and double precision. Add/sub uses
// Zn4FPFAdd01; compares use Zn4FPFMul01. Every ZMM entry in this group passes
// [2] as its fourth argument.
// NOTE(review): the positional arguments look like <write, resources, latency,
// release-at-cycles, micro-ops>, by analogy with the explicit SchedWriteRes
// defs in this file - confirm against the multiclass definitions.
// NOTE(review): WriteFCmp64 passes 1 as its third argument where WriteFCmp and
// the X/Y/Z compare variants pass 2 - confirm this is intended.
defm : Zn4WriteResXMMPair<WriteFAddX, [Zn4FPFAdd01], 3, [1], 1>; // Floating point add/sub (XMM).
defm : Zn4WriteResYMMPair<WriteFAddY, [Zn4FPFAdd01], 3, [1], 1>; // Floating point add/sub (YMM).
defm : Zn4WriteResZMMPair<WriteFAddZ, [Zn4FPFAdd01], 3, [2], 1>; // Floating point add/sub (ZMM).
defm : Zn4WriteResXMMPair<WriteFAdd64, [Zn4FPFAdd01], 3, [1], 1>;  // Floating point double add/sub.
defm : Zn4WriteResXMMPair<WriteFAdd64X, [Zn4FPFAdd01], 3, [1], 1>; // Floating point double add/sub (XMM).
defm : Zn4WriteResYMMPair<WriteFAdd64Y, [Zn4FPFAdd01], 3, [1], 1>; // Floating point double add/sub (YMM).
defm : Zn4WriteResZMMPair<WriteFAdd64Z, [Zn4FPFAdd01], 3, [2], 1>; // Floating point double add/sub (ZMM).
defm : Zn4WriteResXMMPair<WriteFCmp, [Zn4FPFMul01], 2, [2], 1>;  // Floating point compare.
defm : Zn4WriteResXMMPair<WriteFCmpX, [Zn4FPFMul01], 2, [1], 1>; // Floating point compare (XMM).
defm : Zn4WriteResYMMPair<WriteFCmpY, [Zn4FPFMul01], 2, [1], 1>; // Floating point compare (YMM).
defm : Zn4WriteResZMMPair<WriteFCmpZ, [Zn4FPFMul01], 2, [2], 1>; // Floating point compare (ZMM).
defm : Zn4WriteResXMMPair<WriteFCmp64, [Zn4FPFMul01], 1, [1], 1>;  // Floating point double compare.
defm : Zn4WriteResXMMPair<WriteFCmp64X, [Zn4FPFMul01], 2, [1], 1>; // Floating point double compare (XMM).
defm : Zn4WriteResYMMPair<WriteFCmp64Y, [Zn4FPFMul01], 2, [1], 1>; // Floating point double compare (YMM).
defm : Zn4WriteResZMMPair<WriteFCmp64Z, [Zn4FPFMul01], 2, [2], 1>; // Floating point double compare (ZMM).
defm : Zn4WriteResXMMPair<WriteFCom, [Zn4FPFMul01], 3, [2], 1>; // FIXME: latency not from llvm-exegesis  // Floating point compare to flags (X87).
defm : Zn4WriteResXMMPair<WriteFComX, [Zn4FPFMul01], 4, [2], 2>;  // FIXME: latency not from llvm-exegesis // Floating point compare to flags (SSE).
// FP multiply, divide and square root, one entry per register-width variant.
// Multiplies use Zn4FPFMul01; divides and square roots use Zn4FPFDiv. Each ZMM
// entry passes twice the fourth-argument value of its YMM counterpart.
defm : Zn4WriteResXMMPair<WriteFMul, [Zn4FPFMul01], 3, [1], 1>;  // Floating point multiplication.
defm : Zn4WriteResXMMPair<WriteFMulX, [Zn4FPFMul01], 3, [1], 1>; // Floating point multiplication (XMM).
defm : Zn4WriteResYMMPair<WriteFMulY, [Zn4FPFMul01], 3, [1], 1>; // Floating point multiplication (YMM).
defm : Zn4WriteResZMMPair<WriteFMulZ, [Zn4FPFMul01], 3, [2], 1>; // Floating point multiplication (ZMM).
defm : Zn4WriteResXMMPair<WriteFMul64, [Zn4FPFMul01], 3, [1], 1>;  // Floating point double multiplication.
defm : Zn4WriteResXMMPair<WriteFMul64X, [Zn4FPFMul01], 3, [1], 1>; // Floating point double multiplication (XMM).
defm : Zn4WriteResYMMPair<WriteFMul64Y, [Zn4FPFMul01], 3, [1], 1>; // Floating point double multiplication (YMM).
defm : Zn4WriteResZMMPair<WriteFMul64Z, [Zn4FPFMul01], 3, [2], 1>; // Floating point double multiplication (ZMM).
defm : Zn4WriteResXMMPair<WriteFDiv, [Zn4FPFDiv], 11, [3], 1>;  // Floating point division.
defm : Zn4WriteResXMMPair<WriteFDivX, [Zn4FPFDiv], 11, [3], 1>; // Floating point division (XMM).
defm : Zn4WriteResYMMPair<WriteFDivY, [Zn4FPFDiv], 11, [3], 1>; // Floating point division (YMM).
defm : Zn4WriteResZMMPair<WriteFDivZ, [Zn4FPFDiv], 11, [6], 1>; // Floating point division (ZMM).
defm : Zn4WriteResXMMPair<WriteFDiv64, [Zn4FPFDiv], 13, [5], 1>;  // Floating point double division.
defm : Zn4WriteResXMMPair<WriteFDiv64X, [Zn4FPFDiv], 13, [5], 1>; // Floating point double division (XMM).
defm : Zn4WriteResYMMPair<WriteFDiv64Y, [Zn4FPFDiv], 13, [5], 1>; // Floating point double division (YMM).
defm : Zn4WriteResZMMPair<WriteFDiv64Z, [Zn4FPFDiv], 13, [10], 1>; // Floating point double division (ZMM).
defm : Zn4WriteResXMMPair<WriteFSqrt, [Zn4FPFDiv], 15, [5], 1>;   // Floating point square root.
defm : Zn4WriteResXMMPair<WriteFSqrtX, [Zn4FPFDiv], 15, [5], 1>;  // Floating point square root (XMM).
defm : Zn4WriteResYMMPair<WriteFSqrtY, [Zn4FPFDiv], 15, [5], 1>;  // Floating point square root (YMM).
defm : Zn4WriteResZMMPair<WriteFSqrtZ, [Zn4FPFDiv], 15, [10], 1>;  // Floating point square root (ZMM).
defm : Zn4WriteResXMMPair<WriteFSqrt64, [Zn4FPFDiv], 21, [9], 1>;  // Floating point double square root.
defm : Zn4WriteResXMMPair<WriteFSqrt64X, [Zn4FPFDiv], 21, [9], 1>; // Floating point double square root (XMM).
defm : Zn4WriteResYMMPair<WriteFSqrt64Y, [Zn4FPFDiv], 21, [9], 1>; // Floating point double square root (YMM).
defm : Zn4WriteResZMMPair<WriteFSqrt64Z, [Zn4FPFDiv], 21, [18], 1>; // Floating point double square root (ZMM).
defm : Zn4WriteResXMMPair<WriteFSqrt80, [Zn4FPFDiv], 22, [23], 1>; // FIXME: latency not from llvm-exegesis  // Floating point long double square root.
// Reciprocal / reciprocal-square-root estimates, FMA, dot products, sign
// manipulation and rounding. Reciprocal estimates, FMA, dot products and
// WriteFSign use Zn4FPFMul01; reciprocal-square-root estimates use Zn4FPFDiv;
// rounding uses Zn4FPFCvt01.
defm : Zn4WriteResXMMPair<WriteFRcp, [Zn4FPFMul01], 4, [1], 1>;  // Floating point reciprocal estimate.
defm : Zn4WriteResXMMPair<WriteFRcpX, [Zn4FPFMul01], 4, [1], 1>; // Floating point reciprocal estimate (XMM).
defm : Zn4WriteResYMMPair<WriteFRcpY, [Zn4FPFMul01], 5, [1], 1>; // Floating point reciprocal estimate (YMM).
defm : Zn4WriteResZMMPair<WriteFRcpZ, [Zn4FPFMul01], 5, [2], 1>; // Floating point reciprocal estimate (ZMM).
defm : Zn4WriteResXMMPair<WriteFRsqrt, [Zn4FPFDiv], 4, [1], 1>;  // Floating point reciprocal square root estimate.
defm : Zn4WriteResXMMPair<WriteFRsqrtX, [Zn4FPFDiv], 4, [1], 1>; // Floating point reciprocal square root estimate (XMM).
defm : Zn4WriteResYMMPair<WriteFRsqrtY, [Zn4FPFDiv], 4, [1], 1>; // Floating point reciprocal square root estimate (YMM).
defm : Zn4WriteResZMMPair<WriteFRsqrtZ, [Zn4FPFDiv], 5, [2], 1>; // Floating point reciprocal square root estimate (ZMM).
defm : Zn4WriteResXMMPair<WriteFMA, [Zn4FPFMul01], 4, [2], 1>;  // Fused Multiply Add.
defm : Zn4WriteResXMMPair<WriteFMAX, [Zn4FPFMul01], 4, [1], 1>; // Fused Multiply Add (XMM).
defm : Zn4WriteResYMMPair<WriteFMAY, [Zn4FPFMul01], 4, [1], 1>; // Fused Multiply Add (YMM).
defm : Zn4WriteResZMMPair<WriteFMAZ, [Zn4FPFMul01], 4, [2], 1>; // Fused Multiply Add (ZMM).
defm : Zn4WriteResXMMPair<WriteDPPD, [Zn4FPFMul01], 7, [6], 3, /*LoadUOps=*/2>; // Floating point double dot product.
defm : Zn4WriteResXMMPair<WriteDPPS, [Zn4FPFMul01], 11, [8], 8, /*LoadUOps=*/2>; // Floating point single dot product.
defm : Zn4WriteResYMMPair<WriteDPPSY, [Zn4FPFMul01], 11, [8], 7, /*LoadUOps=*/1>; // Floating point single dot product (YMM).
defm : Zn4WriteResXMMPair<WriteFSign, [Zn4FPFMul01], 1, [2], 1>; // FIXME: latency not from llvm-exegesis  // Floating point fabs/fchs.
defm : Zn4WriteResXMMPair<WriteFRnd, [Zn4FPFCvt01], 3, [1], 1>; // Floating point rounding.
defm : Zn4WriteResYMMPair<WriteFRndY, [Zn4FPFCvt01], 3, [1], 1>; // Floating point rounding (YMM).
defm : Zn4WriteResZMMPair<WriteFRndZ, [Zn4FPFCvt01], 3, [2], 1>; // Floating point rounding (ZMM).
965
// FP logicals, TEST, shuffles and blends, one entry per register-width
// variant. Logicals use Zn4FPVMisc0123, TEST uses Zn4FPFMisc12, shuffles use
// Zn4FPVShuf01 and blends use Zn4FPFMul01. Each ZMM entry passes twice the
// fourth-argument value of its YMM counterpart.
// NOTE(review): WriteFTestZ passes 1 as its final argument where the XMM and
// YMM TEST entries pass 2 - confirm this is intended.
defm : Zn4WriteResXMMPair<WriteFLogic, [Zn4FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals.
defm : Zn4WriteResYMMPair<WriteFLogicY, [Zn4FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals (YMM).
defm : Zn4WriteResZMMPair<WriteFLogicZ, [Zn4FPVMisc0123], 1, [2], 1>; // Floating point and/or/xor logicals (ZMM).
defm : Zn4WriteResXMMPair<WriteFTest, [Zn4FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions.
defm : Zn4WriteResYMMPair<WriteFTestY, [Zn4FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (YMM).
defm : Zn4WriteResZMMPair<WriteFTestZ, [Zn4FPFMisc12], 1, [4], 1>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (ZMM).
defm : Zn4WriteResXMMPair<WriteFShuffle, [Zn4FPVShuf01], 1, [1], 1>; // Floating point vector shuffles.
defm : Zn4WriteResYMMPair<WriteFShuffleY, [Zn4FPVShuf01], 1, [1], 1>; // Floating point vector shuffles (YMM).
defm : Zn4WriteResZMMPair<WriteFShuffleZ, [Zn4FPVShuf01], 1, [2], 1>; // Floating point vector shuffles (ZMM).
defm : Zn4WriteResXMMPair<WriteFVarShuffle, [Zn4FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles.
defm : Zn4WriteResYMMPair<WriteFVarShuffleY, [Zn4FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles (YMM).
defm : Zn4WriteResZMMPair<WriteFVarShuffleZ, [Zn4FPVShuf01], 3, [2], 1>; // Floating point vector variable shuffles (ZMM).
defm : Zn4WriteResXMMPair<WriteFBlend, [Zn4FPFMul01], 1, [1], 1>; // Floating point vector blends.
defm : Zn4WriteResYMMPair<WriteFBlendY, [Zn4FPFMul01], 1, [1], 1>; // Floating point vector blends (YMM).
defm : Zn4WriteResZMMPair<WriteFBlendZ, [Zn4FPFMul01], 1, [2], 1>; // Floating point vector blends (ZMM).
defm : Zn4WriteResXMMPair<WriteFVarBlend, [Zn4FPFMul01], 1, [1], 1>; // Fp vector variable blends.
defm : Zn4WriteResYMMPair<WriteFVarBlendY, [Zn4FPFMul01], 1, [1], 1>; // Fp vector variable blends (YMM).
defm : Zn4WriteResZMMPair<WriteFVarBlendZ, [Zn4FPFMul01], 1, [2], 1>; // Fp vector variable blends (ZMM).
984
// Horizontal Add/Sub (float and integer)
// The FP entries (WriteFHAdd*) use Zn4FPFAdd0; the integer entries
// (WritePHAdd*) use Zn4FPVAdd0.
// NOTE(review): WritePHAddY passes 3 / [3] as its third and fourth arguments
// while WritePHAddZ passes 2 / [4] - confirm both are intended.
defm : Zn4WriteResXMMPair<WriteFHAdd, [Zn4FPFAdd0], 4, [2], 3>;
defm : Zn4WriteResYMMPair<WriteFHAddY, [Zn4FPFAdd0], 4, [2], 3, /*LoadUOps=*/1>;
defm : Zn4WriteResZMMPair<WriteFHAddZ, [Zn4FPFAdd0], 6, [4], 3, /*LoadUOps=*/1>;
defm : Zn4WriteResXMMPair<WritePHAdd, [Zn4FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
defm : Zn4WriteResXMMPair<WritePHAddX, [Zn4FPVAdd0], 2, [2], 3>;
defm : Zn4WriteResYMMPair<WritePHAddY, [Zn4FPVAdd0], 3, [3], 3, /*LoadUOps=*/1>;
defm : Zn4WriteResZMMPair<WritePHAddZ, [Zn4FPVAdd0], 2, [4], 3, /*LoadUOps=*/1>;
993
// Vector integer operations.
// Loads (plain, non-temporal and masked) use Zn4FPLd01 + Zn4Load with
// Znver4Model.VecLoadLatency + 1; the two store entries use Zn4FPSt + Zn4Store
// with Znver4Model.StoreLatency.
defm : Zn4WriteResXMM<WriteVecLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecLoadX, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResYMM<WriteVecLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecLoadNT, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResYMM<WriteVecLoadNTY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecMaskedLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResYMM<WriteVecMaskedLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecStore, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecStoreX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
1004
// Register-to-register 128-bit extracts (VEXTRACTF128rr / VEXTRACTI128rr):
// one micro-op on Zn4FPFMisc0 with latency 4.
def Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn4FPFMisc0]> {
  let NumMicroOps = 1;
  let Latency = 4;
  let ReleaseAtCycles = [1];
}
def : InstRW<[Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr],
             (instrs VEXTRACTF128rr, VEXTRACTI128rr)>;
1011
// 128-bit extracts to memory (the ...mr forms). Derived from the
// register-to-register entry: one extra micro-op, and the model's LoadLatency
// added to its latency.
// NOTE(review): the resources listed are store resources (Zn4FPSt, Zn4Store)
// yet the latency is built from LoadLatency - confirm this is intended.
def Zn4WriteVEXTRACTI128mr : SchedWriteRes<[Zn4FPFMisc0, Zn4FPSt, Zn4Store]> {
  let NumMicroOps = !add(1, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps);
  let Latency = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency, Znver4Model.LoadLatency);
  let ReleaseAtCycles = [1, 1, 1];
}
def : InstRW<[Zn4WriteVEXTRACTI128mr],
             (instrs VEXTRACTI128mr, VEXTRACTF128mr)>;
1018
// VINSERTF128 with a memory source. Reuses the numbers of the
// register-to-register extract entry: same micro-op count (+0) and its
// latency plus the model's LoadLatency.
def Zn4WriteVINSERTF128rmr : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPFMisc0]> {
  let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0);
  let Latency = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency, Znver4Model.LoadLatency);
  let ReleaseAtCycles = [1, 1, 1];
}
def : InstRW<[Zn4WriteVINSERTF128rmr],
             (instrs VINSERTF128rm)>;
1025
// Remaining vector integer stores, all on Zn4FPSt + Zn4Store with
// Znver4Model.StoreLatency. The masked-store entries use the same numbers as
// the WriteFMaskedStore* entries earlier in this file.
defm : Zn4WriteResYMM<WriteVecStoreY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecStoreNT, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResYMM<WriteVecStoreNTY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecMaskedStore32, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
defm : Zn4WriteResXMM<WriteVecMaskedStore64, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [4, 1], 10>;
defm : Zn4WriteResYMM<WriteVecMaskedStore32Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [12, 1], 42>;
defm : Zn4WriteResYMM<WriteVecMaskedStore64Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
1033
// Moves between vector registers and GPRs; both directions share the same
// entry on Zn4FPLd01.
defm : Zn4WriteResXMM<WriteVecMoveToGpr, [Zn4FPLd01], 1, [2], 1>;
defm : Zn4WriteResXMM<WriteVecMoveFromGpr, [Zn4FPLd01], 1, [2], 1>;
1036
// Dedicated entry for MMX_MOVQ2FR64rr and MMX_MOVQ2DQrr: two micro-ops,
// latency 1, Zn4FPFMisc0123 released at cycle 2.
def Zn4WriteMOVMMX : SchedWriteRes<[Zn4FPLd01, Zn4FPFMisc0123]> {
  let NumMicroOps = 2;
  let Latency = 1;
  let ReleaseAtCycles = [1, 2];
}
def : InstRW<[Zn4WriteMOVMMX],
             (instrs MMX_MOVQ2FR64rr, MMX_MOVQ2DQrr)>;
1043
// Dedicated entry for MMX_MOVD64rr and MMX_MOVD64to64rr: like Zn4WriteMOVMMX,
// except that Zn4FPFMisc0123 is released at cycle 4 instead of 2.
def Zn4WriteMOVMMXSlow : SchedWriteRes<[Zn4FPLd01, Zn4FPFMisc0123]> {
  let NumMicroOps = 2;
  let Latency = 1;
  let ReleaseAtCycles = [1, 4];
}
def : InstRW<[Zn4WriteMOVMMXSlow],
             (instrs MMX_MOVD64rr, MMX_MOVD64to64rr)>;
1050
// Vector integer ALU op, no logicals.
defm : Zn4WriteResXMMPair<WriteVecALU, [Zn4FPVAdd0123], 1, [1], 1>;
1052
// Dedicated entry for EXTRQ / INSERTQ: one micro-op with latency 3 using
// Zn4FPVShuf01 and Zn4FPLd01.
def Zn4WriteEXTRQ_INSERTQ : SchedWriteRes<[Zn4FPVShuf01, Zn4FPLd01]> {
  let NumMicroOps = 1;
  let Latency = 3;
  let ReleaseAtCycles = [1, 1];
}
def : InstRW<[Zn4WriteEXTRQ_INSERTQ],
             (instrs EXTRQ, INSERTQ)>;
1059
// Dedicated entry for EXTRQI / INSERTQI: same resources and latency as
// Zn4WriteEXTRQ_INSERTQ, but two micro-ops.
def Zn4WriteEXTRQI_INSERTQI : SchedWriteRes<[Zn4FPVShuf01, Zn4FPLd01]> {
  let NumMicroOps = 2;
  let Latency = 3;
  let ReleaseAtCycles = [1, 1];
}
def : InstRW<[Zn4WriteEXTRQI_INSERTQI],
             (instrs EXTRQI, INSERTQI)>;
1066
// Vector integer ALU op, no logicals (XMM).
defm : Zn4WriteResXMMPair<WriteVecALUX, [Zn4FPVAdd0123], 1, [1], 1>;
1068
// Dedicated entry for the listed 128-bit register forms of absolute value,
// saturating add/sub, average, sign and VPCMPEQQ: one micro-op on Zn4FPVAdd01
// with latency 2, resource released at cycle 2.
def Zn4WriteVecALUXSlow : SchedWriteRes<[Zn4FPVAdd01]> {
  let NumMicroOps = 1;
  let Latency = 2;
  let ReleaseAtCycles = [2];
}
def : InstRW<[Zn4WriteVecALUXSlow],
             (instrs PABSBrr, PABSDrr, PABSWrr,
                     PADDSBrr, PADDSWrr, PADDUSBrr, PADDUSWrr,
                     PAVGBrr, PAVGWrr,
                     PSIGNBrr, PSIGNDrr, PSIGNWrr,
                     VPABSBrr, VPABSDrr, VPABSWrr,
                     VPADDSBrr, VPADDSWrr, VPADDUSBrr, VPADDUSWrr,
                     VPAVGBrr, VPAVGWrr,
                     VPCMPEQQrr,
                     VPSIGNBrr, VPSIGNDrr, VPSIGNWrr,
                     PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
                     VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr)>;
1084
// Opmask (K-register) register operations: one micro-op on Zn4FPOpMask01 with
// latency 1.
def Zn4WriteVecOpMask : SchedWriteRes<[Zn4FPOpMask01]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ReleaseAtCycles = [1];
}
def : InstRW<[Zn4WriteVecOpMask],
             (instrs KADDBrr, KADDDrr, KADDQrr, KADDWrr,
                     KANDBrr, KANDDrr, KANDQrr, KANDWrr,
                     KANDNBrr, KANDNDrr, KANDNQrr, KANDNWrr,
                     KMOVBkk, KMOVDkk, KMOVQkk, KMOVWkk,
                     KMOVBrk, KMOVDrk, KMOVQrk, KMOVWrk,
                     KNOTBrr, KNOTDrr, KNOTQrr, KNOTWrr,
                     KORBrr, KORDrr, KORQrr, KORWrr,
                     KORTESTBrr, KORTESTDrr, KORTESTQrr, KORTESTWrr,
                     KTESTBrr, KTESTDrr, KTESTQrr, KTESTWrr,
                     KUNPCKBWrr, KUNPCKDQrr, KUNPCKWDrr,
                     KXNORBrr, KXNORDrr, KXNORQrr, KXNORWrr,
                     KXORBrr, KXORDrr, KXORQrr, KXORWrr)>;
1102
// KMOV*mk forms: one micro-op on Zn4FPOpMask4 with latency 1.
def Zn4WriteVecOpMaskMemMov : SchedWriteRes<[Zn4FPOpMask4]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ReleaseAtCycles = [1];
}
def : InstRW<[Zn4WriteVecOpMaskMemMov],
             (instrs KMOVBmk, KMOVDmk, KMOVQmk, KMOVWmk)>;
1109
// KMOV*kr forms: one micro-op on Zn4FPOpMask4 with latency 1.
def Zn4WriteVecOpMaskKRMov : SchedWriteRes<[Zn4FPOpMask4]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ReleaseAtCycles = [1];
}
def : InstRW<[Zn4WriteVecOpMaskKRMov],
             (instrs KMOVBkr, KMOVDkr, KMOVQkr, KMOVWkr)>;
1116
// Dedicated entry for the VALIGND / VALIGNQ register-immediate forms: one
// micro-op on Zn4FPVAdd12 with latency 4.
def Zn4WriteVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
  let NumMicroOps = 1;
  // TODO: All align instructions are expected to be of 4 cycle latency
  let Latency = 4;
  let ReleaseAtCycles = [1];
}
def : InstRW<[Zn4WriteVecALU2Slow],
             (instrs VALIGNDZrri, VALIGNDZ128rri, VALIGNDZ256rri,
                     VALIGNQZrri, VALIGNQZ128rri, VALIGNQZ256rri)>;
// Vector integer ALU op, no logicals (YMM).
defm : Zn4WriteResYMMPair<WriteVecALUY, [Zn4FPVAdd0123], 1, [1], 1>;
1127
// Dedicated entry for the listed 256-bit register forms of absolute value,
// saturating add/sub, average, sign and VPCMPEQQ: one micro-op on Zn4FPVAdd01
// with latency 1.
def Zn4WriteVecALUYSlow : SchedWriteRes<[Zn4FPVAdd01]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ReleaseAtCycles = [1];
}
def : InstRW<[Zn4WriteVecALUYSlow],
             (instrs VPABSBYrr, VPABSDYrr, VPABSWYrr,
                     VPADDSBYrr, VPADDSWYrr, VPADDUSBYrr, VPADDUSWYrr,
                     VPSUBSBYrr, VPSUBSWYrr, VPSUBUSBYrr, VPSUBUSWYrr,
                     VPAVGBYrr, VPAVGWYrr,
                     VPCMPEQQYrr,
                     VPSIGNBYrr, VPSIGNDYrr, VPSIGNWYrr)>;
1139
// Vector integer ALU op, no logicals (ZMM).
defm : Zn4WriteResZMMPair<WriteVecALUZ, [Zn4FPVAdd0123], 1, [2], 1>;
1141
// Vector integer logicals, TEST, shifts and multiplies, one entry per
// register-width variant. Logicals use Zn4FPVMisc0123, TEST uses
// Zn4FPVAdd12 + Zn4FPSt, shifts use Zn4FPVShift01 and multiplies use
// Zn4FPVMul01. Each ZMM entry passes twice the fourth-argument value(s) of its
// YMM counterpart.
// NOTE(review): WriteVecShiftX passes 2 / [2] where the other shift variants
// pass 1 / [1] (or [2] for ZMM) - confirm this is intended.
defm : Zn4WriteResXMMPair<WriteVecLogic, [Zn4FPVMisc0123], 1, [1], 1>;  // Vector integer and/or/xor logicals.
defm : Zn4WriteResXMMPair<WriteVecLogicX, [Zn4FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (XMM).
defm : Zn4WriteResYMMPair<WriteVecLogicY, [Zn4FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (YMM).
defm : Zn4WriteResZMMPair<WriteVecLogicZ, [Zn4FPVMisc0123], 1, [2], 1>; // Vector integer and/or/xor logicals (ZMM).
defm : Zn4WriteResXMMPair<WriteVecTest, [Zn4FPVAdd12, Zn4FPSt], 1, [1, 1], 2>;  // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions.
defm : Zn4WriteResYMMPair<WriteVecTestY, [Zn4FPVAdd12, Zn4FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis  // Vector integer TEST instructions (YMM).
defm : Zn4WriteResZMMPair<WriteVecTestZ, [Zn4FPVAdd12, Zn4FPSt], 1, [2, 2], 2>; // FIXME: latency not from llvm-exegesis  // Vector integer TEST instructions (ZMM).
defm : Zn4WriteResXMMPair<WriteVecShift, [Zn4FPVShift01], 1, [1], 1>;  // Vector integer shifts (default).
defm : Zn4WriteResXMMPair<WriteVecShiftX, [Zn4FPVShift01], 2, [2], 1>; // Vector integer shifts (XMM).
defm : Zn4WriteResYMMPair<WriteVecShiftY, [Zn4FPVShift01], 1, [1], 1>; // Vector integer shifts (YMM).
defm : Zn4WriteResZMMPair<WriteVecShiftZ, [Zn4FPVShift01], 1, [2], 1>; // Vector integer shifts (ZMM).
defm : Zn4WriteResXMMPair<WriteVecShiftImm, [Zn4FPVShift01], 1, [1], 1>;  // Vector integer immediate shifts (default).
defm : Zn4WriteResXMMPair<WriteVecShiftImmX, [Zn4FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (XMM).
defm : Zn4WriteResYMMPair<WriteVecShiftImmY, [Zn4FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (YMM).
defm : Zn4WriteResZMMPair<WriteVecShiftImmZ, [Zn4FPVShift01], 1, [2], 1>; // Vector integer immediate shifts (ZMM).
defm : Zn4WriteResXMMPair<WriteVecIMul, [Zn4FPVMul01], 3, [1], 1>;  // Vector integer multiply (default).
defm : Zn4WriteResXMMPair<WriteVecIMulX, [Zn4FPVMul01], 3, [1], 1>; // Vector integer multiply (XMM).
defm : Zn4WriteResYMMPair<WriteVecIMulY, [Zn4FPVMul01], 3, [1], 1>; // Vector integer multiply (YMM).
defm : Zn4WriteResZMMPair<WriteVecIMulZ, [Zn4FPVMul01], 3, [2], 1>; // Vector integer multiply (ZMM).
defm : Zn4WriteResXMMPair<WritePMULLD, [Zn4FPVMul01], 3, [1], 1>; // Vector PMULLD.
defm : Zn4WriteResYMMPair<WritePMULLDY, [Zn4FPVMul01], 3, [1], 1>; // Vector PMULLD (YMM).
defm : Zn4WriteResZMMPair<WritePMULLDZ, [Zn4FPVMul01], 3, [2], 1>; // Vector PMULLD (ZMM).
// Vector integer shuffles, blends, PSADBW, MPSAD and PHMINPOS. Shuffles use
// Zn4FPVShuf01; blends use Zn4FPVMisc0123; variable blends use Zn4FPVMul01;
// PSADBW and MPSAD use Zn4FPVAdd0123; PHMINPOS uses Zn4FPVAdd01. Each ZMM
// entry passes twice the fourth-argument value of its YMM counterpart.
defm : Zn4WriteResXMMPair<WriteShuffle, [Zn4FPVShuf01], 1, [1], 1>;  // Vector shuffles.
defm : Zn4WriteResXMMPair<WriteShuffleX, [Zn4FPVShuf01], 1, [1], 1>; // Vector shuffles (XMM).
defm : Zn4WriteResYMMPair<WriteShuffleY, [Zn4FPVShuf01], 1, [1], 1>; // Vector shuffles (YMM).
defm : Zn4WriteResZMMPair<WriteShuffleZ, [Zn4FPVShuf01], 1, [2], 1>; // Vector shuffles (ZMM).
defm : Zn4WriteResXMMPair<WriteVarShuffle, [Zn4FPVShuf01], 1, [1], 1>;  // Vector variable shuffles.
defm : Zn4WriteResXMMPair<WriteVarShuffleX, [Zn4FPVShuf01], 1, [1], 1>; // Vector variable shuffles (XMM).
defm : Zn4WriteResYMMPair<WriteVarShuffleY, [Zn4FPVShuf01], 1, [1], 1>; // Vector variable shuffles (YMM).
defm : Zn4WriteResZMMPair<WriteVarShuffleZ, [Zn4FPVShuf01], 1, [2], 1>; // Vector variable shuffles (ZMM).
defm : Zn4WriteResXMMPair<WriteBlend, [Zn4FPVMisc0123], 1, [1], 1>; // Vector blends.
defm : Zn4WriteResYMMPair<WriteBlendY, [Zn4FPVMisc0123], 1, [1], 1>; // Vector blends (YMM).
defm : Zn4WriteResZMMPair<WriteBlendZ, [Zn4FPVMisc0123], 1, [2], 1>; // Vector blends (ZMM).
defm : Zn4WriteResXMMPair<WriteVarBlend, [Zn4FPVMul01], 1, [1], 1>; // Vector variable blends.
defm : Zn4WriteResYMMPair<WriteVarBlendY, [Zn4FPVMul01], 1, [1], 1>; // Vector variable blends (YMM).
defm : Zn4WriteResZMMPair<WriteVarBlendZ, [Zn4FPVMul01], 1, [2], 1>; // Vector variable blends (ZMM).
defm : Zn4WriteResXMMPair<WritePSADBW, [Zn4FPVAdd0123], 3, [2], 1>;  // Vector PSADBW.
defm : Zn4WriteResXMMPair<WritePSADBWX, [Zn4FPVAdd0123], 3, [2], 1>; // Vector PSADBW (XMM).
defm : Zn4WriteResYMMPair<WritePSADBWY, [Zn4FPVAdd0123], 3, [2], 1>; // Vector PSADBW (YMM).
defm : Zn4WriteResZMMPair<WritePSADBWZ, [Zn4FPVAdd0123], 4, [4], 1>; // Vector PSADBW (ZMM).
defm : Zn4WriteResXMMPair<WriteMPSAD, [Zn4FPVAdd0123], 4, [8], 4, /*LoadUOps=*/2>; // Vector MPSAD.
defm : Zn4WriteResYMMPair<WriteMPSADY, [Zn4FPVAdd0123], 4, [8], 3, /*LoadUOps=*/1>; // Vector MPSAD (YMM).
defm : Zn4WriteResZMMPair<WriteMPSADZ, [Zn4FPVAdd0123], 4, [16], 3, /*LoadUOps=*/1>; // Vector MPSAD (ZMM).
defm : Zn4WriteResXMMPair<WritePHMINPOS, [Zn4FPVAdd01], 3, [1], 1>;  // Vector PHMINPOS.
1186
// Vector insert/extract operations.
// Insert and extract use Zn4FPLd01; extract-and-store uses Zn4FPSt + Zn4Store
// and adds 1 to Znver4Model.StoreLatency.
// NOTE(review): WriteVecInsert passes -1 as LoadUOps, presumably to lower the
// micro-op count of the load-folded form - confirm against the multiclass.
defm : Zn4WriteResXMMPair<WriteVecInsert, [Zn4FPLd01], 1, [2], 2, /*LoadUOps=*/-1>; // Insert gpr to vector element.
defm : Zn4WriteResXMM<WriteVecExtract, [Zn4FPLd01], 1, [2], 2>; // Extract vector element to gpr.
defm : Zn4WriteResXMM<WriteVecExtractSt, [Zn4FPSt, Zn4Store], !add(1, Znver4Model.StoreLatency), [1, 1], 2>; // Extract vector element and store.
1191
// MOVMSK operations.
// All four entries are identical apart from the write class and register
// width: Zn4FPVMisc2 with 1 / [1] / 1.
defm : Zn4WriteResXMM<WriteFMOVMSK, [Zn4FPVMisc2], 1, [1], 1>;
defm : Zn4WriteResXMM<WriteVecMOVMSK, [Zn4FPVMisc2], 1, [1], 1>;
defm : Zn4WriteResYMM<WriteVecMOVMSKY, [Zn4FPVMisc2], 1, [1], 1>;
defm : Zn4WriteResXMM<WriteMMXMOVMSK, [Zn4FPVMisc2], 1, [1], 1>;
1197
// Conversion between integer and float.
// Double -> Integer, followed by its XMM / YMM / ZMM forms.
defm : Zn4WriteResXMMPair<WriteCvtSD2I,  [Zn4FPFCvt01], 1, [1], 1>;
defm : Zn4WriteResXMMPair<WriteCvtPD2I,  [Zn4FPFCvt01], 3, [2], 1>;
defm : Zn4WriteResYMMPair<WriteCvtPD2IY, [Zn4FPFCvt01], 3, [2], 2>;
defm : Zn4WriteResZMMPair<WriteCvtPD2IZ, [Zn4FPFCvt01], 3, [4], 2>;
1203
// Write on Zn4FPFCvt01: 2 micro-ops, latency 1, ReleaseAtCycles of 2.
// NOTE(review): judging by the name this is meant for the MMX
// Double -> Integer conversions, but no InstRW in this part of the file
// binds it - confirm it is referenced elsewhere.
def Zn4WriteCvtPD2IMMX : SchedWriteRes<[Zn4FPFCvt01]> {
  let NumMicroOps = 2;
  let ReleaseAtCycles = [2];
  let Latency = 1;
}
// Float -> Integer.
defm : Zn4WriteResXMMPair<WriteCvtSS2I, [Zn4FPFCvt01], 5, [5], 2>;

// Float -> Integer (XMM / YMM / ZMM).
defm : Zn4WriteResXMMPair<WriteCvtPS2I,  [Zn4FPFCvt01], 3, [1], 1>;
defm : Zn4WriteResYMMPair<WriteCvtPS2IY, [Zn4FPFCvt01], 4, [1], 1>;
defm : Zn4WriteResZMMPair<WriteCvtPS2IZ, [Zn4FPFCvt01], 4, [2], 2>;
1214
// Integer -> Double.
defm : Zn4WriteResXMMPair<WriteCvtI2SD, [Zn4FPFCvt01], 4, [2], 2,
                          /*LoadUOps=*/-1>;
// Integer -> Double (XMM / YMM / ZMM).
defm : Zn4WriteResXMMPair<WriteCvtI2PD,  [Zn4FPFCvt01], 3, [1], 1>;
defm : Zn4WriteResYMMPair<WriteCvtI2PDY, [Zn4FPFCvt01], 3, [2], 2,
                          /*LoadUOps=*/-1>;
defm : Zn4WriteResZMMPair<WriteCvtI2PDZ, [Zn4FPFCvt01], 4, [4], 4,
                          /*LoadUOps=*/-1>;
1219
// Write on Zn4FPFCvt01: 2 micro-ops, latency 2, ReleaseAtCycles of 6.
// NOTE(review): judging by the name this is meant for the MMX
// Integer -> Double conversions, but no InstRW in this part of the file
// binds it - confirm it is referenced elsewhere.
def Zn4WriteCvtI2PDMMX : SchedWriteRes<[Zn4FPFCvt01]> {
  let NumMicroOps = 2;
  let ReleaseAtCycles = [6];
  let Latency = 2;
}
1225
// Integer -> Float.
defm : Zn4WriteResXMMPair<WriteCvtI2SS, [Zn4FPFCvt01], 3, [2], 2,
                          /*LoadUOps=*/-1>;
// Integer -> Float (XMM / YMM / ZMM).
defm : Zn4WriteResXMMPair<WriteCvtI2PS,  [Zn4FPFCvt01], 3, [1], 1>;
defm : Zn4WriteResYMMPair<WriteCvtI2PSY, [Zn4FPFCvt01], 3, [1], 1>;
defm : Zn4WriteResZMMPair<WriteCvtI2PSZ, [Zn4FPFCvt01], 3, [2], 2>;
1230
// Write on Zn4FPFCvt01: 2 micro-ops, latency 3, ReleaseAtCycles of 1.
// NOTE(review): judging by the name this is meant for the MMX
// Integer -> Float conversions, but no InstRW in this part of the file
// binds it - confirm it is referenced elsewhere.
def Zn4WriteCvtI2PSMMX : SchedWriteRes<[Zn4FPFCvt01]> {
  let NumMicroOps = 2;
  let ReleaseAtCycles = [1];
  let Latency = 3;
}
1236
// Float -> Double size conversion, followed by its XMM / YMM / ZMM forms.
defm : Zn4WriteResXMMPair<WriteCvtSS2SD,  [Zn4FPFCvt01], 3, [1], 1>;
defm : Zn4WriteResXMMPair<WriteCvtPS2PD,  [Zn4FPFCvt01], 3, [1], 1>;
defm : Zn4WriteResYMMPair<WriteCvtPS2PDY, [Zn4FPFCvt01], 4, [2], 2,
                          /*LoadUOps=*/-1>;
defm : Zn4WriteResZMMPair<WriteCvtPS2PDZ, [Zn4FPFCvt01], 6, [4], 4,
                          /*LoadUOps=*/-1>;

// Double -> Float size conversion, followed by its XMM / YMM / ZMM forms.
defm : Zn4WriteResXMMPair<WriteCvtSD2SS,  [Zn4FPFCvt01], 3, [1], 1>;
defm : Zn4WriteResXMMPair<WriteCvtPD2PS,  [Zn4FPFCvt01], 3, [1], 1>;
defm : Zn4WriteResYMMPair<WriteCvtPD2PSY, [Zn4FPFCvt01], 6, [2], 2>;
defm : Zn4WriteResZMMPair<WriteCvtPD2PSZ, [Zn4FPFCvt01], 6, [4], 4>;

// Half -> Float size conversion, followed by its YMM / ZMM forms.
defm : Zn4WriteResXMMPair<WriteCvtPH2PS,  [Zn4FPFCvt01], 3, [1], 1>;
defm : Zn4WriteResYMMPair<WriteCvtPH2PSY, [Zn4FPFCvt01], 4, [2], 2,
                          /*LoadUOps=*/-1>;
defm : Zn4WriteResZMMPair<WriteCvtPH2PSZ, [Zn4FPFCvt01], 4, [4], 4,
                          /*LoadUOps=*/-1>;
1250
// Float -> Half size conversion (XMM / YMM / ZMM).
defm : Zn4WriteResXMM<WriteCvtPS2PH,  [Zn4FPFCvt01], 3, [2], 1>;
defm : Zn4WriteResYMM<WriteCvtPS2PHY, [Zn4FPFCvt01], 6, [2], 2>;
defm : Zn4WriteResZMM<WriteCvtPS2PHZ, [Zn4FPFCvt01], 6, [2], 2>;

// Float -> Half size conversion + store (XMM / YMM / ZMM). The latency
// argument is the conversion latency plus the model's StoreLatency.
defm : Zn4WriteResXMM<WriteCvtPS2PHSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store],
                      !add(3, Znver4Model.StoreLatency), [1, 1, 1], 2>;
defm : Zn4WriteResYMM<WriteCvtPS2PHYSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store],
                      !add(6, Znver4Model.StoreLatency), [2, 1, 1], 3>;
// The ZMM store form goes through the ZMM helper, matching WriteCvtPS2PHZ.
// NOTE(review): assumes Zn4WriteResZMM and Zn4WriteResYMM expand identically
// for non-folded writes - confirm against their definitions.
defm : Zn4WriteResZMM<WriteCvtPS2PHZSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store],
                      !add(6, Znver4Model.StoreLatency), [2, 1, 1], 3>;
1258
// CRC32 instruction: bound to Zn4ALU1 only.
defm : Zn4WriteResIntPair<WriteCRC32,
                          [Zn4ALU1], 3, [1], 1>;
1261
// SHA1MSG1, register form: 2 micro-ops on Zn4FPU0123, latency 2,
// ReleaseAtCycles of 2.
def Zn4WriteSHA1MSG1rr : SchedWriteRes<[Zn4FPU0123]> {
  let NumMicroOps = 2;
  let ReleaseAtCycles = [2];
  let Latency = 2;
}
def : InstRW<[Zn4WriteSHA1MSG1rr],
             (instrs SHA1MSG1rr)>;
1268
// SHA1MSG1, memory form: derived from the register form. Latency is the
// model's LoadLatency plus the register-form latency; the micro-op count is
// the register form's, unchanged.
def Zn4WriteSHA1MSG1rm
    : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let NumMicroOps = Zn4WriteSHA1MSG1rr.NumMicroOps;
  let ReleaseAtCycles = [1, 1, 2];
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG1rr.Latency);
}
def : InstRW<[Zn4WriteSHA1MSG1rm],
             (instrs SHA1MSG1rm)>;
1275
// SHA1MSG2 / SHA1NEXTE, register forms: 1 micro-op on Zn4FPU0123, latency 1,
// ReleaseAtCycles of 2.
def Zn4WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn4FPU0123]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [2];
  let Latency = 1;
}
def : InstRW<[Zn4WriteSHA1MSG2rr_SHA1NEXTErr],
             (instrs SHA1MSG2rr,
                     SHA1NEXTErr)>;
1282
// SHA1MSG2 / SHA1NEXTE, memory forms: derived from the register-form write.
// Latency adds the model's LoadLatency; the micro-op count is unchanged.
def Zn4Writerm_SHA1MSG2rm_SHA1NEXTErm
    : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let NumMicroOps = Zn4WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps;
  let ReleaseAtCycles = [1, 1, 2];
  let Latency = !add(Znver4Model.LoadLatency,
                     Zn4WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
}
def : InstRW<[Zn4Writerm_SHA1MSG2rm_SHA1NEXTErm],
             (instrs SHA1MSG2rm,
                     SHA1NEXTErm)>;
1289
// SHA256MSG1, register form: 2 micro-ops on Zn4FPU0123, latency 2,
// ReleaseAtCycles of 3.
def Zn4WriteSHA256MSG1rr : SchedWriteRes<[Zn4FPU0123]> {
  let NumMicroOps = 2;
  let ReleaseAtCycles = [3];
  let Latency = 2;
}
def : InstRW<[Zn4WriteSHA256MSG1rr],
             (instrs SHA256MSG1rr)>;
1296
// SHA256MSG1, memory form: derived from the register form. Latency adds the
// model's LoadLatency; the micro-op count is unchanged.
def Zn4Writerm_SHA256MSG1rm
    : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let NumMicroOps = Zn4WriteSHA256MSG1rr.NumMicroOps;
  let ReleaseAtCycles = [1, 1, 3];
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG1rr.Latency);
}
def : InstRW<[Zn4Writerm_SHA256MSG1rm],
             (instrs SHA256MSG1rm)>;
1303
// SHA256MSG2, register form: 4 micro-ops on Zn4FPU0123, latency 3,
// ReleaseAtCycles of 8.
def Zn4WriteSHA256MSG2rr : SchedWriteRes<[Zn4FPU0123]> {
  let NumMicroOps = 4;
  let ReleaseAtCycles = [8];
  let Latency = 3;
}
def : InstRW<[Zn4WriteSHA256MSG2rr],
             (instrs SHA256MSG2rr)>;
1310
// SHA256MSG2, memory form: derived from the register form. Latency adds the
// model's LoadLatency, and one micro-op is added to the register-form count.
def Zn4WriteSHA256MSG2rm
    : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let NumMicroOps = !add(Zn4WriteSHA256MSG2rr.NumMicroOps, 1);
  let ReleaseAtCycles = [1, 1, 8];
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG2rr.Latency);
}
def : InstRW<[Zn4WriteSHA256MSG2rm],
             (instrs SHA256MSG2rm)>;
1317
// SHA1RNDS4, register form: 1 micro-op on Zn4FPU0123, latency 6,
// ReleaseAtCycles of 8.
def Zn4WriteSHA1RNDS4rri : SchedWriteRes<[Zn4FPU0123]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [8];
  let Latency = 6;
}
def : InstRW<[Zn4WriteSHA1RNDS4rri],
             (instrs SHA1RNDS4rri)>;
1324
// SHA256RNDS2, register form: 1 micro-op on Zn4FPU0123, latency 4,
// ReleaseAtCycles of 8.
def Zn4WriteSHA256RNDS2rr : SchedWriteRes<[Zn4FPU0123]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [8];
  let Latency = 4;
}
def : InstRW<[Zn4WriteSHA256RNDS2rr],
             (instrs SHA256RNDS2rr)>;
1331
// Strings instructions (all on Zn4FPVAdd0123).
// Return Mask: Packed Compare Implicit Length Strings.
defm : Zn4WriteResXMMPair<WritePCmpIStrM, [Zn4FPVAdd0123], 6, [8], 3,
                          /*LoadUOps=*/1>;
// Return Mask: Packed Compare Explicit Length Strings.
defm : Zn4WriteResXMMPair<WritePCmpEStrM, [Zn4FPVAdd0123], 6, [12], 7,
                          /*LoadUOps=*/5>;
// Return Index: Packed Compare Implicit Length Strings.
defm : Zn4WriteResXMMPair<WritePCmpIStrI, [Zn4FPVAdd0123], 2, [8], 4>;
// Return Index: Packed Compare Explicit Length Strings.
defm : Zn4WriteResXMMPair<WritePCmpEStrI, [Zn4FPVAdd0123], 6, [12], 8,
                          /*LoadUOps=*/4>;
1341
// AES instructions (all on Zn4FPAES01 with the same arguments).
// Decryption, encryption.
defm : Zn4WriteResXMMPair<WriteAESDecEnc, [Zn4FPAES01], 4, [1], 1>;
// InvMixColumn.
defm : Zn4WriteResXMMPair<WriteAESIMC,    [Zn4FPAES01], 4, [1], 1>;
// Key Generation.
defm : Zn4WriteResXMMPair<WriteAESKeyGen, [Zn4FPAES01], 4, [1], 1>;

// Carry-less multiplication instructions.
defm : Zn4WriteResXMMPair<WriteCLMul,
                          [Zn4FPCLM01], 4, [4], 4>;
1349
// EMMS/FEMMS
// FIXME: latency not from llvm-exegesis
defm : Zn4WriteResInt<WriteEMMS, [Zn4ALU0123], 2, [1], 1>;

// Load/store MXCSR
// FIXME: latency not from llvm-exegesis (applies to both entries below)
defm : Zn4WriteResInt<WriteLDMXCSR, [Zn4AGU012, Zn4Load, Zn4ALU0123],
                      !add(Znver4Model.LoadLatency, 1), [1, 1, 6], 1>;
defm : Zn4WriteResInt<WriteSTMXCSR, [Zn4ALU0123, Zn4AGU012, Zn4Store],
                      !add(1, Znver4Model.StoreLatency), [60, 1, 1], 2>;

// Catch-all for expensive system instructions.
defm : Zn4WriteResInt<WriteSystem,
                      [Zn4ALU0123], 100, [100], 100>;
1359
// VZEROUPPER: 1 micro-op on Zn4FPU0123, ReleaseAtCycles of 1, latency 0.
def Zn4WriteVZEROUPPER : SchedWriteRes<[Zn4FPU0123]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [1];
  let Latency = 0; // FIXME: not from llvm-exegesis
}
def : InstRW<[Zn4WriteVZEROUPPER],
             (instrs VZEROUPPER)>;
1366
// VZEROALL: 18 micro-ops on Zn4FPU0123, ReleaseAtCycles of 24, latency 10.
def Zn4WriteVZEROALL : SchedWriteRes<[Zn4FPU0123]> {
  let NumMicroOps = 18;
  let ReleaseAtCycles = [24];
  let Latency = 10; // FIXME: not from llvm-exegesis
}
def : InstRW<[Zn4WriteVZEROALL],
             (instrs VZEROALL)>;
1373
// AVX2.
// Fp 256-bit width vector shuffles.
defm : Zn4WriteResYMMPair<WriteFShuffle256, [Zn4FPVShuf], 2, [1], 1,
                          /*LoadUOps=*/2>;
// Fp 256-bit width variable shuffles.
defm : Zn4WriteResYMMPair<WriteFVarShuffle256, [Zn4FPVShuf], 7, [1], 2,
                          /*LoadUOps=*/1>;
// 256-bit width vector shuffles.
defm : Zn4WriteResYMMPair<WriteShuffle256, [Zn4FPVShuf], 1, [1], 1>;
1378
// VPERM2I128 / VPERM2F128, register forms: 1 micro-op on Zn4FPVShuf,
// latency 3, ReleaseAtCycles of 1.
def Zn4WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn4FPVShuf]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [1];
  let Latency = 3;
}
def : InstRW<[Zn4WriteVPERM2I128rr_VPERM2F128rr],
             (instrs VPERM2I128rr,
                     VPERM2F128rr)>;
1385
// VPERM2F128, memory form: derived from the register-form write. Latency
// adds the model's LoadLatency; the micro-op count is unchanged.
def Zn4WriteVPERM2F128rm
    : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
  let NumMicroOps = Zn4WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps;
  let ReleaseAtCycles = [1, 1, 1];
  let Latency = !add(Znver4Model.LoadLatency,
                     Zn4WriteVPERM2I128rr_VPERM2F128rr.Latency);
}
def : InstRW<[Zn4WriteVPERM2F128rm],
             (instrs VPERM2F128rm)>;
1392
// VPERMPS (YMM), register form: 2 micro-ops on Zn4FPVShuf, latency 7,
// ReleaseAtCycles of 1.
def Zn4WriteVPERMPSYrr : SchedWriteRes<[Zn4FPVShuf]> {
  let NumMicroOps = 2;
  let ReleaseAtCycles = [1];
  let Latency = 7;
}
def : InstRW<[Zn4WriteVPERMPSYrr],
             (instrs VPERMPSYrr)>;
1399
// VPERMPS (YMM), memory form: derived from the register form. Latency adds
// the model's LoadLatency, and one micro-op is added to the register-form
// count.
def Zn4WriteVPERMPSYrm
    : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
  let NumMicroOps = !add(Zn4WriteVPERMPSYrr.NumMicroOps, 1);
  let ReleaseAtCycles = [1, 1, 2];
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMPSYrr.Latency);
}
def : InstRW<[Zn4WriteVPERMPSYrm],
             (instrs VPERMPSYrm)>;
1406
// VPERMPD / VPERMQ (YMM), register-immediate forms: 2 micro-ops on
// Zn4FPVShuf, latency 6, ReleaseAtCycles of 1.
def Zn4WriteVPERMYri : SchedWriteRes<[Zn4FPVShuf]> {
  let NumMicroOps = 2;
  let ReleaseAtCycles = [1];
  let Latency = 6;
}
def : InstRW<[Zn4WriteVPERMYri],
             (instrs VPERMPDYri,
                     VPERMQYri)>;
1413
// VPERMPD (YMM), memory-immediate form: derived from Zn4WriteVPERMYri.
// Latency adds the model's LoadLatency, and one micro-op is added.
def Zn4WriteVPERMPDYmi
    : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
  let NumMicroOps = !add(Zn4WriteVPERMYri.NumMicroOps, 1);
  let ReleaseAtCycles = [1, 1, 2];
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMYri.Latency);
}
def : InstRW<[Zn4WriteVPERMPDYmi],
             (instrs VPERMPDYmi)>;
1420
// VPERMD (YMM), register form: 2 micro-ops on Zn4FPVShuf, latency 5,
// ReleaseAtCycles of 1.
def Zn4WriteVPERMDYrr : SchedWriteRes<[Zn4FPVShuf]> {
  let NumMicroOps = 2;
  let ReleaseAtCycles = [1];
  let Latency = 5;
}
def : InstRW<[Zn4WriteVPERMDYrr],
             (instrs VPERMDYrr)>;
1427
// VPERMQ (memory-immediate) and VPERMD (memory) YMM forms: derived from
// Zn4WriteVPERMDYrr. Latency adds the model's LoadLatency; the micro-op
// count is unchanged.
def Zn4WriteVPERMYm
    : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
  let NumMicroOps = Zn4WriteVPERMDYrr.NumMicroOps;
  let ReleaseAtCycles = [1, 1, 2];
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMDYrr.Latency);
}
def : InstRW<[Zn4WriteVPERMYm],
             (instrs VPERMQYmi,
                     VPERMDYrm)>;
1434
// 256-bit width packed vector width-changing move.
defm : Zn4WriteResYMMPair<WriteVPMOV256, [Zn4FPVShuf01], 4, [3], 2,
                          /*LoadUOps=*/-1>;
// 256-bit width vector variable shuffles.
defm : Zn4WriteResYMMPair<WriteVarShuffle256, [Zn4FPVShuf01], 1, [1], 2>;
// Variable vector shifts (XMM / YMM / ZMM).
defm : Zn4WriteResXMMPair<WriteVarVecShift,  [Zn4FPVShift01], 1, [1], 1>;
defm : Zn4WriteResYMMPair<WriteVarVecShiftY, [Zn4FPVShift01], 1, [1], 1>;
defm : Zn4WriteResZMMPair<WriteVarVecShiftZ, [Zn4FPVShift01], 1, [2], 2>;
1440
// Old microcoded instructions that nobody uses.
defm : Zn4WriteResInt<WriteMicrocoded,
                      [Zn4ALU0123], 100, [100], 100>;

// Fence instructions.
defm : Zn4WriteResInt<WriteFence,
                      [Zn4ALU0123], 1, [100], 1>;
1446
// LFENCE: 1 micro-op on Zn4LSU, latency 1, ReleaseAtCycles of 30.
def Zn4WriteLFENCE : SchedWriteRes<[Zn4LSU]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [30];
  let Latency = 1;
}
def : InstRW<[Zn4WriteLFENCE],
             (instrs LFENCE)>;
1453
// SFENCE: 1 micro-op on Zn4LSU, latency 1, ReleaseAtCycles of 1.
def Zn4WriteSFENCE : SchedWriteRes<[Zn4LSU]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [1];
  let Latency = 1;
}
def : InstRW<[Zn4WriteSFENCE],
             (instrs SFENCE)>;
1460
// Nop, not very useful except it provides a model for nops!
// FIXME: latency not from llvm-exegesis
defm : Zn4WriteResInt<WriteNop, [Zn4ALU0123], 0, [1], 1>;
1463
1464
1465///////////////////////////////////////////////////////////////////////////////
1466// Zero Cycle Move
1467///////////////////////////////////////////////////////////////////////////////
1468
// Write with an empty resource list: latency 0, 1 micro-op. Bound below to
// the 32/64-bit GPR register-to-register moves and MOVSX32rr32.
def Zn4WriteZeroLatency : SchedWriteRes<[]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [];
  let Latency = 0;
}
def : InstRW<[Zn4WriteZeroLatency],
             (instrs MOV32rr, MOV32rr_REV,
                     MOV64rr, MOV64rr_REV,
                     MOVSX32rr32)>;
1477
// Write with an empty resource list: latency 0, 2 micro-ops. Bound below to
// the 32/64-bit register-register XCHG forms.
def Zn4WriteSwapRenameable : SchedWriteRes<[]> {
  let NumMicroOps = 2;
  let ReleaseAtCycles = [];
  let Latency = 0;
}
def : InstRW<[Zn4WriteSwapRenameable],
             (instrs XCHG32rr, XCHG32ar,
                     XCHG64rr, XCHG64ar)>;
1485
// Compare+Exchange - TODO RMW support.
defm : Zn4WriteResInt<WriteXCHG, [Zn4ALU0123], 0, [8], 2>;
1487
// FP register moves: empty resource list, latency 0, 1 micro-op.
// Each width goes through the helper of the matching register class.
// NOTE(review): assumes Zn4WriteResZMM and Zn4WriteResYMM expand identically
// for non-folded writes - confirm against their definitions.
defm : Zn4WriteResXMM<WriteFMoveX, [], 0, [], 1>;
defm : Zn4WriteResYMM<WriteFMoveY, [], 0, [], 1>;
defm : Zn4WriteResZMM<WriteFMoveZ, [], 0, [], 1>;

// Vector register moves. The MMX write is the only one that is given an
// execution resource (Zn4FPFMisc0123) and a non-zero latency.
defm : Zn4WriteResXMM<WriteVecMove, [Zn4FPFMisc0123], 1, [1], 1>; // MMX
defm : Zn4WriteResXMM<WriteVecMoveX, [], 0, [], 1>;
defm : Zn4WriteResYMM<WriteVecMoveY, [], 0, [], 1>;
defm : Zn4WriteResZMM<WriteVecMoveZ, [], 0, [], 1>;
1496
// Register moves declared optimizable for this model. All opcodes sit in a
// single equivalence class guarded by TruePred.
def : IsOptimizableRegisterMove<[
  InstructionEquivalenceClass<[
    // GPR variants.
    MOV32rr,    MOV32rr_REV,
    MOV64rr,    MOV64rr_REV,
    MOVSX32rr32,
    XCHG32rr,   XCHG32ar,
    XCHG64rr,   XCHG64ar,

    // MMX variants.
    // MMX moves are *NOT* eliminated.

    // SSE variants.
    MOVAPSrr,   MOVAPSrr_REV,
    MOVUPSrr,   MOVUPSrr_REV,
    MOVAPDrr,   MOVAPDrr_REV,
    MOVUPDrr,   MOVUPDrr_REV,
    MOVDQArr,   MOVDQArr_REV,
    MOVDQUrr,   MOVDQUrr_REV,

    // AVX variants.
    VMOVAPSrr,  VMOVAPSrr_REV,
    VMOVUPSrr,  VMOVUPSrr_REV,
    VMOVAPDrr,  VMOVAPDrr_REV,
    VMOVUPDrr,  VMOVUPDrr_REV,
    VMOVDQArr,  VMOVDQArr_REV,
    VMOVDQUrr,  VMOVDQUrr_REV,

    // AVX YMM variants.
    VMOVAPSYrr, VMOVAPSYrr_REV,
    VMOVUPSYrr, VMOVUPSYrr_REV,
    VMOVAPDYrr, VMOVAPDYrr_REV,
    VMOVUPDYrr, VMOVUPDYrr_REV,
    VMOVDQAYrr, VMOVDQAYrr_REV,
    VMOVDQUYrr, VMOVDQUYrr_REV
  ], TruePred>
]>;
1534
// FIXUP and RANGE Instructions
// 1 micro-op on Zn4FPFMisc01, latency 2, ReleaseAtCycles of 2.
def Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr : SchedWriteRes<[Zn4FPFMisc01]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [2];
  let Latency = 2;
}
def : InstRW<[Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr], (instregex
        "VFIXUPIMM(S|P)(S|D)(Z|Z128|Z256?)rrik",
        "VFIXUPIMM(S|P)(S|D)(Z?|Z128?|Z256?)rrikz",
        "VFIXUPIMM(S|P)(S|D)(Z128|Z256?)rri",
        "VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)",
        "VRANGE(S|P)(S|D)(Z|Z128|Z256?)rri(b?)k",
        "VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)kz"
        )>;
1546
// SCALE & REDUCE instructions
// 2 micro-ops on Zn4FPFMisc23, latency 6, ReleaseAtCycles of 6.
def Zn4WriteSCALErr : SchedWriteRes<[Zn4FPFMisc23]> {
  let NumMicroOps = 2;
  let ReleaseAtCycles = [6];
  let Latency = 6;
}
def : InstRW<[Zn4WriteSCALErr], (instregex
        "V(SCALEF|REDUCE)(S|P)(S|D)(Z?|Z128?|Z256?)(rr|rrb|rrkz|rrik|rrikz|rri)(_Int?|_Intkz?)",
        "(V?)REDUCE(PD|PS|SD|SS)(Z?|Z128?)(rri|rrikz|rrib)"
        )>;
1557
// BF16PS Instructions
// 2 micro-ops on Zn4FPFMisc23, latency 6, ReleaseAtCycles of 6.
def Zn4WriteBF16 : SchedWriteRes<[Zn4FPFMisc23]> {
  let NumMicroOps = 2;
  let ReleaseAtCycles = [6];
  let Latency = 6;
}
def : InstRW<[Zn4WriteBF16], (instregex
        "(V?)DPBF16PS(Z?|Z128?|Z256?)(r|rk|rkz)"
        )>;
1567
// BUSD and VPMADD Instructions
// 1 micro-op on Zn4FPFMisc01, latency 4, ReleaseAtCycles of 4.
def Zn4WriteBUSDr_VPMADDr : SchedWriteRes<[Zn4FPFMisc01]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [4];
  let Latency = 4;
}
def : InstRW<[Zn4WriteBUSDr_VPMADDr], (instregex
        "VPDP(BU|WS)(S|P)(S|D|DS)(Z|Z128|Z256)(r|rk|rkz)",
        "VPMADD52(H|L)UQ(Z|Z128|Z256)(r|rk|rkz)"
        )>;
1578
// SHIFT instructions
// 1 micro-op on Zn4FPFMisc01, latency 2, ReleaseAtCycles of 2.
// NOTE(review): the last pattern, "VFMSUB231SSZr_Intkz", names an FMA opcode
// rather than a shift/rotate - confirm it is intended to use this write.
def Zn4WriteSHIFTrr : SchedWriteRes<[Zn4FPFMisc01]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [2];
  let Latency = 2;
}
def : InstRW<[Zn4WriteSHIFTrr], (instregex
        "VP(LZCNT|SHLD|SHRD?)(D|Q|W|VD|VQ|VW?)(Z?|Z128?|Z256?)(rr|rk|rrk|rrkz|rri|rrik|rrikz)",
        "(V?)P(SLL|SRL|SRA)(D|Q|W|DQ)(Y?|Z?|Z128?|Z256?)(rr|rrk|rrkz)",
        "(V?)P(SLL|SRL|SRA)DQYri",
        "(V?)P(SLL|SRL)DQ(Z?|Z256?)ri",
        "(V?)P(SHUFB)(Y|Z|Z128|Z256?)(rr|rrk|rrkz)",
        "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z?|Z128?|Z256?)(rr|rrk|rrkz)",
        "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z256?)(ri|rik|rikz)",
        "(V?)P(ROL|ROR)(D|Q)(Z?|Z128?)(ri|rik|rikz)",
        "VPSHUFBITQMBZ128rr",
        "VFMSUB231SSZr_Intkz"
        )>;
1596
// Immediate-count VPSLL/VPSRL/VPSRA forms matched by the regex below:
// 1 micro-op on Zn4FPFMisc01, latency 1, ReleaseAtCycles of 1.
def Zn4WriteSHIFTri : SchedWriteRes<[Zn4FPFMisc01]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [1];
  let Latency = 1;
}
def : InstRW<[Zn4WriteSHIFTri], (instregex
        "VP(SLL|SRL|SRA)(D|Q|W)(Z|Z128|Z256?)(ri|rik|rikz)"
        )>;
1605
// ALIGN Instructions
// 1 micro-op on Zn4FPFMisc12, latency 2, ReleaseAtCycles of 2.
def Zn4WriteALIGN : SchedWriteRes<[Zn4FPFMisc12]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [2];
  let Latency = 2;
}
def : InstRW<[Zn4WriteALIGN], (instregex
        "(V?)PALIGNR(Z?|Z128?|Z256?)(rri|rrik|rrikz)"
        )>;
1615
// PACK Instructions
// 1 micro-op on Zn4FPFMisc12, latency 2, ReleaseAtCycles of 2.
def Zn4WritePACK : SchedWriteRes<[Zn4FPFMisc12]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [2];
  let Latency = 2;
}
def : InstRW<[Zn4WritePACK], (instregex
        "(V?)PACK(SS|US)(DW|WB)(Z?|Z128?|Z256?)(rr|rrk|rrkz)"
        )>;
1625
// MAX and MIN Instructions (the first pattern also covers CMP forms).
// 1 micro-op on Zn4FPFMisc01, latency 2, ReleaseAtCycles of 2.
def Zn4WriteFCmp64 : SchedWriteRes<[Zn4FPFMisc01]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [2];
  let Latency = 2;
}
def : InstRW<[Zn4WriteFCmp64], (instregex
        "(V?)CMP(S|P)(S|D)(rr|rri|rr_Int)",
        "(V?|VP?)(MAX|MIN|MINC|MAXC)(S|P|U)(S|D|Q)(Z?|Z128?|Z256?)(rr|rri|rrk|rrkz)(_Int?)",
        "VP(MAX|MIN)(SQ|UQ)(Z|Z128|Z256)(rr|rrk|rrkz)",
        "(V?)(MAX|MAXC|MIN|MINC)PD(Z|Z128|Z256?)(rr|rrk|rrkz)"
        )>;
1638
// MOV Instructions
// 1 micro-op on Zn4FPFMisc12, latency 2, ReleaseAtCycles of 2.
def Zn4MOVS : SchedWriteRes<[Zn4FPFMisc12]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [2];
  let Latency = 2;
}
def : InstRW<[Zn4MOVS], (instregex
        "(V?)PMOV(SX|ZX)(BD|BQ|BW|WD|WQ|DQ)(Z128?|Z256?)(rr|rrk|rrkz)",
        "(V?)PMOV(SX|QD|UZ|ZX)(BD|BQ|BW?)(Y|Z128?)(rr|rrk|rrkz)",
        "(V?)PMOV(SX|US|ZX)(DQ|WD|QW|WQ?)(Y|Z128?)(rr|rrk|rrkz)",
        "(V?)VMOVDDUP(Z|Z128|Z256)(rr|rrk|rrkz)",
        "VPMOV(DB|DW|QB|QD|QW|SDB|SDW|SQB|SQD|SQW|SWB|USDB|USDW|USQB|USQD|USWB|WB)(Z128?)(rr|rrk|rrkz)"
        )>;
1652
// PMOVSX/PMOVZX forms matched by the regex below:
// 1 micro-op on Zn4FPFMisc12, latency 4, ReleaseAtCycles of 4.
def Zn4MOVSZ : SchedWriteRes<[Zn4FPFMisc12]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [4];
  let Latency = 4;
}
def : InstRW<[Zn4MOVSZ], (instregex
        "(V?)PMOV(SX|ZX)(BD|BQ|BW|WD|WQ|DQ)(Z?)(rr|rrk|rrkz)"
        )>;
1661
// PMOV forms matched by the regex below:
// 1 micro-op on Zn4FPFMisc12, latency 5, ReleaseAtCycles of 5.
def Zn4MOVSrr : SchedWriteRes<[Zn4FPFMisc12]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [5];
  let Latency = 5;
}
def : InstRW<[Zn4MOVSrr], (instregex
        "(V?)PMOV(DB|QB|QW|SDB|SQB|SQW|USDB|USQB|USQW)(Z?)(rr|rrk|rrkz)"
        )>;
1670
1671
// VPTEST Instructions
// Z128 pattern: 1 micro-op on Zn4FPFMisc01, latency 3, ReleaseAtCycles of 3.
def Zn4VPTESTZ128 : SchedWriteRes<[Zn4FPFMisc01]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [3];
  let Latency = 3;
}
def : InstRW<[Zn4VPTESTZ128], (instregex
        "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z128?)(rrk)"
        )>;
1681
// Z256 pattern: 1 micro-op on Zn4FPFMisc01, latency 4, ReleaseAtCycles of 4.
def Zn4VPTESTZ256 : SchedWriteRes<[Zn4FPFMisc01]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [4];
  let Latency = 4;
}
def : InstRW<[Zn4VPTESTZ256], (instregex
        "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z256?)(rr|rrk)"
        )>;
1690
// Z pattern: 1 micro-op on Zn4FPFMisc01, latency 5, ReleaseAtCycles of 5.
def Zn4VPTESTZ : SchedWriteRes<[Zn4FPFMisc01]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [5];
  let Latency = 5;
}
def : InstRW<[Zn4VPTESTZ], (instregex
        "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z?)(rrk)"
        )>;
1699
// CONFLICT Instructions
// Z128 forms: 1 micro-op on Zn4FPFMisc01, latency 2, ReleaseAtCycles of 2.
def Zn4CONFLICTZ128 : SchedWriteRes<[Zn4FPFMisc01]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [2];
  let Latency = 2;
}
def : InstRW<[Zn4CONFLICTZ128], (instregex
        "VPCONFLICT(D|Q)(Z128)(rr|rrk|rrkz)"
        )>;
1709
// Z / Z256 forms: 4 micro-ops, latency 6, with ReleaseAtCycles of 2 on each
// of Zn4FPFMisc01, Zn4FPFMisc12 and Zn4FPFMisc23.
// NOTE(review): unlike the Z128 pattern, this regex has no "rrk"
// alternative - confirm that is intentional.
def Zn4CONFLICTrr
    : SchedWriteRes<[Zn4FPFMisc01, Zn4FPFMisc12, Zn4FPFMisc23]> {
  let NumMicroOps = 4;
  let ReleaseAtCycles = [2, 2, 2];
  let Latency = 6;
}
def : InstRW<[Zn4CONFLICTrr], (instregex
        "VPCONFLICT(D|Q)(Z|Z256)(rr|rrkz)"
        )>;
1718
// RSQRT Instructions
// 1 micro-op on Zn4FPFMisc01, latency 5, ReleaseAtCycles of 2. Despite the
// def name, the regex below covers PD and PS at every Z width.
def Zn4VRSQRT14PDZ256 : SchedWriteRes<[Zn4FPFMisc01]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [2];
  let Latency = 5;
}
def : InstRW<[Zn4VRSQRT14PDZ256], (instregex
        "VRSQRT14(PD|PS)(Z?|Z128?|Z256?)(r|rr|rk|rrk|rkz|rrkz)"
        )>;
1728
1729
// PERM Instructions
// VPERMILPS/PD: 1 micro-op on Zn4FPFMisc123, latency 2, ReleaseAtCycles
// of 2.
def Zn4PERMILP : SchedWriteRes<[Zn4FPFMisc123]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [2];
  let Latency = 2;
}
def : InstRW<[Zn4PERMILP], (instregex
        "VPERMILP(S|D)(Y|Z|Z128|Z256)(rr|rrk|rrkz)"
        )>;
1739
// VPERMI2*/VPERMT2* Z128 forms: 1 micro-op on Zn4FPFMisc12, latency 3,
// ReleaseAtCycles of 2.
def Zn4PERMIT2_128 : SchedWriteRes<[Zn4FPFMisc12]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [2];
  let Latency = 3;
}
def : InstRW<[Zn4PERMIT2_128], (instregex
        "VPERM(I2|T2)(PS|PD|W)Z128(rr|rrk|rrkz)",
        "VPERM(I2|T2)(B|D|Q)Z128(rr|rrk|rrkz)"
        )>;
1749
// COMPRESS Z128 forms and the VPERM(B|D|Q|W) pattern below: 1 micro-op on
// Zn4FPFMisc12, latency 2, ReleaseAtCycles of 2.
def Zn4PERMIT2_128rr : SchedWriteRes<[Zn4FPFMisc12]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [2];
  let Latency = 2;
}
def : InstRW<[Zn4PERMIT2_128rr], (instregex
        "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z128(rr|rrk|rrkz)",
        "VPERM(B|D|Q|W)(Z128?)(rr|rrk|rrkz)"
        )>;
1759
// Z256 permute / compress / expand forms: 1 micro-op on Zn4FPFMisc12,
// latency 4, ReleaseAtCycles of 2.
def Zn4PERMIT2_256 : SchedWriteRes<[Zn4FPFMisc12]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [2];
  let Latency = 4;
}
def : InstRW<[Zn4PERMIT2_256], (instregex
        "VPERM(I2|T2)(PS|PD|W)Z256(rr|rrk|rrkz)",
        "VPERMP(S|D)Z256(rr|rrk|rrkz)",
        "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z256(rr|rrk|rrkz)",
        "VPERM(B|D|Q|W)Z256(rr|rrk|rrkz)",
        "VPERM(I2|Q|T2)(B|D|Q)Z256(rr|rrk|rrkz)",
        "VPEXPAND(B|W)Z256(rr|rrk|rrkz)"
        )>;
1773
// Z (512-bit suffix) permute / compress / expand forms: 1 micro-op on
// Zn4FPFMisc12, latency 5, ReleaseAtCycles of 2.
def Zn4PERMIT2Z : SchedWriteRes<[Zn4FPFMisc12]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [2];
  let Latency = 5;
}
def : InstRW<[Zn4PERMIT2Z], (instregex
        "VPERM(I2|T2)(PS|PD|W)Z(rr|rrk|rrkz)",
        "VPERM(B|D|W)Z(rr|rrk|rrkz)",
        "VPERM(I2|Q|T2)(B|D|Q)Z(rr|rrk|rrkz)",
        "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z(rr|rrk|rrkz)",
        "VPEXPAND(B|W)Z(rr|rrk|rrkz)",
        "VPERMP(S|D)Z(rr|rrk|rrkz)"
        )>;
1787
// ALU SLOW Misc Instructions
// 1 micro-op on Zn4FPFMisc01, latency 2, ReleaseAtCycles of 2. The list holds
// the rr / rrk / rrkz Z128 forms of each mnemonic, one mnemonic per row.
def Zn4VecALUZSlow : SchedWriteRes<[Zn4FPFMisc01]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [2];
  let Latency = 2;
}
def : InstRW<[Zn4VecALUZSlow], (instrs
        VPABSBZ128rr,   VPABSBZ128rrk,   VPABSBZ128rrkz,
        VPABSDZ128rr,   VPABSDZ128rrk,   VPABSDZ128rrkz,
        VPABSQZ128rr,   VPABSQZ128rrk,   VPABSQZ128rrkz,
        VPABSWZ128rr,   VPABSWZ128rrk,   VPABSWZ128rrkz,
        VPADDSBZ128rr,  VPADDSBZ128rrk,  VPADDSBZ128rrkz,
        VPADDSWZ128rr,  VPADDSWZ128rrk,  VPADDSWZ128rrkz,
        VPADDUSBZ128rr, VPADDUSBZ128rrk, VPADDUSBZ128rrkz,
        VPADDUSWZ128rr, VPADDUSWZ128rrk, VPADDUSWZ128rrkz,
        VPAVGBZ128rr,   VPAVGBZ128rrk,   VPAVGBZ128rrkz,
        VPAVGWZ128rr,   VPAVGWZ128rrk,   VPAVGWZ128rrkz,
        VPOPCNTBZ128rr, VPOPCNTBZ128rrk, VPOPCNTBZ128rrkz,
        VPOPCNTDZ128rr, VPOPCNTDZ128rrk, VPOPCNTDZ128rrkz,
        VPOPCNTQZ128rr, VPOPCNTQZ128rrk, VPOPCNTQZ128rrkz,
        VPOPCNTWZ128rr, VPOPCNTWZ128rrk, VPOPCNTWZ128rrkz,
        VPSUBSBZ128rr,  VPSUBSBZ128rrk,  VPSUBSBZ128rrkz,
        VPSUBSWZ128rr,  VPSUBSWZ128rrk,  VPSUBSWZ128rrkz,
        VPSUBUSBZ128rr, VPSUBUSBZ128rrk, VPSUBUSBZ128rrkz,
        VPSUBUSWZ128rr, VPSUBUSWZ128rrk, VPSUBUSWZ128rrkz
        )>;
1810
1811
1812///////////////////////////////////////////////////////////////////////////////
1813// Dependency breaking instructions.
1814///////////////////////////////////////////////////////////////////////////////
1815
// GPR XOR/SUB: Zn4WriteZeroLatency when ZeroIdiomPredicate holds, otherwise
// the regular WriteALU.
def Zn4WriteZeroIdiom : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteALU]>
]>;
def : InstRW<[Zn4WriteZeroIdiom],
             (instrs XOR32rr, XOR32rr_REV,
                     XOR64rr, XOR64rr_REV,
                     SUB32rr, SUB32rr_REV,
                     SUB64rr, SUB64rr_REV)>;
1824
// GPR CMP: Zn4WriteZeroLatency when operands 0 and 1 are the same register
// (CheckSameRegOperand<0, 1>), otherwise the regular WriteALU.
def Zn4WriteZeroIdiomEFLAGS : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<CheckSameRegOperand<0, 1>>, [Zn4WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteALU]>
]>;
def : InstRW<[Zn4WriteZeroIdiomEFLAGS],
             (instrs CMP8rr,  CMP8rr_REV,
                     CMP16rr, CMP16rr_REV,
                     CMP32rr, CMP32rr_REV,
                     CMP64rr, CMP64rr_REV)>;
1833
// AVX XMM fp logic: Zn4WriteZeroLatency when ZeroIdiomPredicate holds,
// otherwise the regular WriteFLogic.
def Zn4WriteFZeroIdiom : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteFLogic]>
]>;
// NOTE: XORPSrr, XORPDrr are not zero-cycle!
def : InstRW<[Zn4WriteFZeroIdiom],
             (instrs VXORPSrr,  VXORPDrr,
                     VANDNPSrr, VANDNPDrr)>;
1841
// AVX YMM fp logic: Zn4WriteZeroLatency when ZeroIdiomPredicate holds,
// otherwise the regular WriteFLogicY.
def Zn4WriteFZeroIdiomY : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteFLogicY]>
]>;
def : InstRW<[Zn4WriteFZeroIdiomY],
             (instrs VXORPSYrr,  VXORPDYrr,
                     VANDNPSYrr, VANDNPDYrr)>;
1848
// AVX XMM integer logic: Zn4WriteZeroLatency when ZeroIdiomPredicate holds,
// otherwise the regular WriteVecLogicX.
def Zn4WriteVZeroIdiomLogicX : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteVecLogicX]>
]>;
// NOTE: PXORrr,PANDNrr are not zero-cycle!
def : InstRW<[Zn4WriteVZeroIdiomLogicX],
             (instrs VPXORrr,
                     VPANDNrr)>;
1855
// AVX YMM integer logic: Zn4WriteZeroLatency when ZeroIdiomPredicate holds,
// otherwise the regular WriteVecLogicY.
def Zn4WriteVZeroIdiomLogicY : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteVecLogicY]>
]>;
def : InstRW<[Zn4WriteVZeroIdiomLogicY],
             (instrs VPXORYrr,
                     VPANDNYrr)>;
1861
// AVX XMM integer subtract / compare-greater: Zn4WriteZeroLatency when
// ZeroIdiomPredicate holds, otherwise the regular WriteVecALUX.
def Zn4WriteVZeroIdiomALUX : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteVecALUX]>
]>;
// NOTE: PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
//       PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr are not zero-cycle!
def : InstRW<[Zn4WriteVZeroIdiomALUX], (instrs
        VPSUBBrr,   VPSUBWrr,   VPSUBDrr,   VPSUBQrr,
        VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr)>;
1871
// AVX YMM integer subtract / compare-greater: Zn4WriteZeroLatency when
// ZeroIdiomPredicate holds, otherwise the regular WriteVecALUY.
def Zn4WriteVZeroIdiomALUY : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteVecALUY]>
]>;
def : InstRW<[Zn4WriteVZeroIdiomALUY], (instrs
        VPSUBBYrr,   VPSUBWYrr,   VPSUBDYrr,   VPSUBQYrr,
        VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr)>;
1879
// Opcodes registered as zero idioms for this model. Every class below is
// guarded by ZeroIdiomPredicate.
def : IsZeroIdiomFunction<[
  // GPR Zero-idioms.
  DepBreakingClass<[
    XOR32rr, XOR32rr_REV,
    XOR64rr, XOR64rr_REV,
    SUB32rr, SUB32rr_REV,
    SUB64rr, SUB64rr_REV
  ], ZeroIdiomPredicate>,

  // SSE XMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    XORPSrr,  XORPDrr,
    ANDNPSrr, ANDNPDrr,

    // int variants.
    PXORrr,
    PANDNrr,
    PSUBBrr,   PSUBWrr, PSUBDrr, PSUBQrr,
    PSUBSBrr,  PSUBSWrr,
    PSUBUSBrr, PSUBUSWrr,
    PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr
  ], ZeroIdiomPredicate>,

  // AVX XMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    VXORPSrr,  VXORPDrr,
    VANDNPSrr, VANDNPDrr,

    // int variants.
    VPXORrr,
    VPANDNrr,
    VPSUBBrr,   VPSUBWrr, VPSUBDrr, VPSUBQrr,
    VPSUBSBrr,  VPSUBSWrr,
    VPSUBUSBrr, VPSUBUSWrr,
    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr
  ], ZeroIdiomPredicate>,

  // AVX YMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    VXORPSYrr,  VXORPDYrr,
    VANDNPSYrr, VANDNPDYrr,

    // int variants.
    VPXORYrr,
    VPANDNYrr,
    VPSUBBYrr,   VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
    VPSUBSBYrr,  VPSUBSWYrr,
    VPSUBUSBYrr, VPSUBUSWYrr,
    VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr
  ], ZeroIdiomPredicate>,
]>;
1932
// Opcodes registered as dependency-breaking for this model. The CMP class is
// guarded by CheckSameRegOperand<0, 1>; every other class uses
// ZeroIdiomPredicate.
def : IsDepBreakingFunction<[
  // GPR
  DepBreakingClass<[
    SBB32rr, SBB32rr_REV,
    SBB64rr, SBB64rr_REV
  ], ZeroIdiomPredicate>,
  DepBreakingClass<[
    CMP8rr,  CMP8rr_REV,
    CMP16rr, CMP16rr_REV,
    CMP32rr, CMP32rr_REV,
    CMP64rr, CMP64rr_REV
  ], CheckSameRegOperand<0, 1> >,

  // SSE
  DepBreakingClass<[
    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
  ], ZeroIdiomPredicate>,

  // AVX XMM
  DepBreakingClass<[
    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
  ], ZeroIdiomPredicate>,

  // AVX YMM
  DepBreakingClass<[
    VPCMPEQBYrr, VPCMPEQWYrr, VPCMPEQDYrr, VPCMPEQQYrr
  ], ZeroIdiomPredicate>,
]>;
1956
1957} // SchedModel
1958
1959