1//=- ARMScheduleA57.td - ARM Cortex-A57 Scheduling Defs -----*- tablegen -*-=//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the machine model for ARM Cortex-A57 to support
11// instruction scheduling and other instruction cost heuristics.
12//
13//===----------------------------------------------------------------------===//
14
15//===----------------------------------------------------------------------===//
16// *** Common description and scheduling model parameters taken from AArch64 ***
17// The Cortex-A57 is a traditional superscalar microprocessor with a
18// conservative 3-wide in-order stage for decode and dispatch. Combined with the
19// much wider out-of-order issue stage, this produced a need to carefully
20// schedule micro-ops so that all three decoded each cycle are successfully
21// issued as the reservation station(s) simply don't stay occupied for long.
22// Therefore, IssueWidth is set to the narrower of the two at three, while still
23// modeling the machine as out-of-order.
24
25def IsCPSRDefinedPred : SchedPredicate<[{TII->isCPSRDefined(*MI)}]>;
26def IsCPSRDefinedAndPredicatedPred :
27  SchedPredicate<[{TII->isCPSRDefined(*MI) && TII->isPredicated(*MI)}]>;
28
29// Cortex A57 rev. r1p0 or later (false = r0px)
30def IsR1P0AndLaterPred : SchedPredicate<[{false}]>;
31
32// If Addrmode3 contains register offset (not immediate)
33def IsLdrAm3RegOffPred :
34  SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 1)}]>;
35// The same predicate with operand offset 2 and 3:
36def IsLdrAm3RegOffPredX2 :
37  SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 2)}]>;
38def IsLdrAm3RegOffPredX3 :
39  SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 3)}]>;
40
41// If Addrmode3 contains "minus register"
42def IsLdrAm3NegRegOffPred :
43  SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 1)}]>;
44// The same predicate with operand offset 2 and 3:
45def IsLdrAm3NegRegOffPredX2 :
46  SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 2)}]>;
47def IsLdrAm3NegRegOffPredX3 :
48  SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 3)}]>;
49
50// Load, scaled register offset, not plus LSL2
51def IsLdstsoScaledNotOptimalPredX0 :
52  SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 0)}]>;
53def IsLdstsoScaledNotOptimalPred :
54  SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 1)}]>;
55def IsLdstsoScaledNotOptimalPredX2 :
56  SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 2)}]>;
57
58// Load, scaled register offset
59def IsLdstsoScaledPred :
60  SchedPredicate<[{TII->isLdstScaledReg(*MI, 1)}]>;
61def IsLdstsoScaledPredX2 :
62  SchedPredicate<[{TII->isLdstScaledReg(*MI, 2)}]>;
63
64def IsLdstsoMinusRegPredX0 :
65  SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 0)}]>;
66def IsLdstsoMinusRegPred :
67  SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 1)}]>;
68def IsLdstsoMinusRegPredX2 :
69  SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 2)}]>;
70
71// Load, scaled register offset
72def IsLdrAm2ScaledPred :
73  SchedPredicate<[{TII->isAm2ScaledReg(*MI, 1)}]>;
74
75// LDM, base reg in list
76def IsLdmBaseRegInList :
77  SchedPredicate<[{TII->isLDMBaseRegInList(*MI)}]>;
78
79class A57WriteLMOpsListType<list<SchedWriteRes> writes> {
80  list <SchedWriteRes> Writes = writes;
81  SchedMachineModel SchedModel = ?;
82}
83
84// *** Common description and scheduling model parameters taken from AArch64 ***
85// (AArch64SchedA57.td)
86def CortexA57Model : SchedMachineModel {
87  let IssueWidth        =   3; // 3-way decode and dispatch
88  let MicroOpBufferSize = 128; // 128 micro-op re-order buffer
89  let LoadLatency       =   4; // Optimistic load latency
90  let MispredictPenalty =  16; // Fetch + Decode/Rename/Dispatch + Branch
91
92  // Enable partial & runtime unrolling.
93  let LoopMicroOpBufferSize = 16;
94  let CompleteModel = 1;
95
96  // FIXME: Remove when all errors have been fixed.
97  let FullInstRWOverlapCheck = 0;
98}
99
100//===----------------------------------------------------------------------===//
101// Define each kind of processor resource and number available on Cortex-A57.
102// Cortex A-57 has 8 pipelines that each has its own 8-entry queue where
103// micro-ops wait for their operands and then issue out-of-order.
104
105def A57UnitB : ProcResource<1>;  // Type B micro-ops
106def A57UnitI : ProcResource<2>;  // Type I micro-ops
107def A57UnitM : ProcResource<1>;  // Type M micro-ops
108def A57UnitL : ProcResource<1>;  // Type L micro-ops
109def A57UnitS : ProcResource<1>;  // Type S micro-ops
110
111def A57UnitX : ProcResource<1>;  // Type X micro-ops (F1)
112def A57UnitW : ProcResource<1>;  // Type W micro-ops (F0)
113
114let SchedModel = CortexA57Model in {
115  def A57UnitV : ProcResGroup<[A57UnitX, A57UnitW]>;    // Type V micro-ops
116}
117
118let SchedModel = CortexA57Model in {
119
120//===----------------------------------------------------------------------===//
121// Define customized scheduler read/write types specific to the Cortex-A57.
122
123include "ARMScheduleA57WriteRes.td"
124
125// To have "CompleteModel = 1", support of pseudos and special instructions
126def : InstRW<[WriteNoop], (instregex "(t)?BKPT$", "(t2)?CDP(2)?$",
127  "(t2)?CLREX$", "CONSTPOOL_ENTRY$", "COPY_STRUCT_BYVAL_I32$",
128  "(t2)?CPS[123]p$", "(t2)?DBG$", "(t2)?DMB$", "(t2)?DSB$", "ERET$",
129  "(t2|t)?HINT$", "(t)?HLT$", "(t2)?HVC$", "(t2)?ISB$", "ITasm$",
130  "(t2)?RFE(DA|DB|IA|IB)", "(t)?SETEND", "(t2)?SETPAN", "(t2)?SMC", "SPACE",
131  "(t2)?SRS(DA|DB|IA|IB)", "SWP(B)?", "t?TRAP", "(t2|t)?UDF$", "t2DCPS", "t2SG",
132  "t2TT", "tCPS", "CMP_SWAP", "t?SVC", "t2IT", "CompilerBarrier",
133  "t__brkdiv0")>;
134
135def : InstRW<[WriteNoop], (instregex "VMRS", "VMSR", "FMSTAT")>;
136
137// Specific memory instrs
138def : InstRW<[WriteNoop, WriteNoop], (instregex "(t2)?LDA", "(t2)?LDC", "(t2)?STC",
139  "(t2)?STL", "(t2)?LDREX", "(t2)?STREX", "MEMCPY")>;
140
141// coprocessor moves
142def : InstRW<[WriteNoop, WriteNoop], (instregex
143  "(t2)?MCR(2|R|R2)?$", "(t2)?MRC(2)?$",
144  "(t2)?MRRC(2)?$", "(t2)?MRS(banked|sys|_AR|_M|sys_AR)?$",
145  "(t2)?MSR(banked|i|_AR|_M)?$")>;
146
147// Deprecated instructions
148def : InstRW<[WriteNoop], (instregex "FLDM", "FSTM")>;
149
150// Pseudos
151def : InstRW<[WriteNoop], (instregex "(t2)?ABS$",
152  "(t)?ADJCALLSTACKDOWN$", "(t)?ADJCALLSTACKUP$", "(t2|t)?Int_eh_sjlj",
153  "tLDRpci_pic", "(t2)?SUBS_PC_LR",
154  "JUMPTABLE", "tInt_WIN_eh_sjlj_longjmp",
155  "VLD(1|2)LN(d|q)(WB_fixed_|WB_register_)?Asm",
156  "VLD(3|4)(DUP|LN)?(d|q)(WB_fixed_|WB_register_)?Asm",
157  "VST(1|2)LN(d|q)(WB_fixed_|WB_register_)?Asm",
158  "VST(3|4)(DUP|LN)?(d|q)(WB_fixed_|WB_register_)?Asm",
159  "WIN__CHKSTK", "WIN__DBZCHK")>;
160
161// Miscellaneous
162// -----------------------------------------------------------------------------
163
164def : InstRW<[A57Write_1cyc_1I], (instrs COPY)>;
165
166// --- 3.2 Branch Instructions ---
167// B, BX, BL, BLX (imm, reg != LR, reg == LR), CBZ, CBNZ
168
169def : InstRW<[A57Write_1cyc_1B], (instregex "(t2|t)?B$", "t?BX", "(t2|t)?Bcc$",
170  "t?TAILJMP(d|r)", "TCRETURN(d|r)i", "tBfar", "tCBN?Z")>;
171def : InstRW<[A57Write_1cyc_1B_1I],
172  (instregex "t?BL$", "BL_pred$", "t?BLXi", "t?TPsoft")>;
173def : InstRW<[A57Write_2cyc_1B_1I], (instregex "BLX", "tBLX(NS)?r")>;
174// Pseudos
175def : InstRW<[A57Write_2cyc_1B_1I], (instregex "BCCi64", "BCCZi64")>;
176def : InstRW<[A57Write_3cyc_1B_1I], (instregex "BR_JTadd", "t?BR_JTr",
177  "t2BR_JT", "t2BXJ", "(t2)?TB(B|H)(_JT)?$", "tBRIND")>;
178def : InstRW<[A57Write_6cyc_1B_1L], (instregex "BR_JTm")>;
179
180// --- 3.3 Arithmetic and Logical Instructions ---
181// ADD{S}, ADC{S}, ADR,	AND{S},	BIC{S},	CMN, CMP, EOR{S}, ORN{S}, ORR{S},
182// RSB{S}, RSC{S}, SUB{S}, SBC{S}, TEQ, TST
183
184def : InstRW<[A57Write_1cyc_1I], (instregex "tADDframe")>;
185
186// shift by register, conditional or unconditional
187// TODO: according to the doc, conditional uses I0/I1, unconditional uses M
188// Why more complex instruction uses more simple pipeline?
189// May be an error in doc.
190def A57WriteALUsi : SchedWriteVariant<[
191  // lsl #2, lsl #1, or lsr #1.
192  SchedVar<IsPredicatedPred, [A57Write_2cyc_1M]>,
193  SchedVar<NoSchedPred,      [A57Write_2cyc_1M]>
194]>;
195def A57WriteALUsr : SchedWriteVariant<[
196  SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
197  SchedVar<NoSchedPred,      [A57Write_2cyc_1M]>
198]>;
199def A57WriteALUSsr : SchedWriteVariant<[
200  SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
201  SchedVar<NoSchedPred,      [A57Write_2cyc_1M]>
202]>;
203def A57ReadALUsr : SchedReadVariant<[
204  SchedVar<IsPredicatedPred, [ReadDefault]>,
205  SchedVar<NoSchedPred,      [ReadDefault]>
206]>;
207def : SchedAlias<WriteALUsi,  A57WriteALUsi>;
208def : SchedAlias<WriteALUsr,  A57WriteALUsr>;
209def : SchedAlias<WriteALUSsr, A57WriteALUSsr>;
210def : SchedAlias<ReadALUsr,   A57ReadALUsr>;
211
212def A57WriteCMPsr : SchedWriteVariant<[
213  SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
214  SchedVar<NoSchedPred,      [A57Write_2cyc_1M]>
215]>;
216def : SchedAlias<WriteCMP,   A57Write_1cyc_1I>;
217def : SchedAlias<WriteCMPsi, A57Write_2cyc_1M>;
218def : SchedAlias<WriteCMPsr, A57WriteCMPsr>;
219
220// --- 3.4 Move and Shift Instructions ---
221// Move, basic
222// MOV{S}, MOVW, MVN{S}
223def : InstRW<[A57Write_1cyc_1I], (instregex "MOV(r|i|i16|r_TC)",
224  "(t2)?MVN(CC)?(r|i)", "BMOVPCB_CALL", "BMOVPCRX_CALL",
225  "MOVCC(r|i|i16|i32imm)", "tMOV", "tMVN")>;
226
227// Move, shift by immed, setflags/no setflags
228// (ASR, LSL, LSR, ROR, RRX)=MOVsi, MVN
229// setflags = isCPSRDefined
230def A57WriteMOVsi : SchedWriteVariant<[
231  SchedVar<IsCPSRDefinedPred,              [A57Write_2cyc_1M]>,
232  SchedVar<NoSchedPred,                    [A57Write_1cyc_1I]>
233]>;
234def : InstRW<[A57WriteMOVsi], (instregex "MOV(CC)?si", "MVNsi",
235  "ASRi", "(t2|t)ASRri", "LSRi", "(t2|t)LSRri", "LSLi", "(t2|t)LSLri", "RORi",
236  "(t2|t)RORri", "(t2)?RRX", "t2MOV", "tROR")>;
237
238// shift by register, conditional or unconditional, setflags/no setflags
239def A57WriteMOVsr : SchedWriteVariant<[
240  SchedVar<IsCPSRDefinedAndPredicatedPred, [A57Write_2cyc_1I]>,
241  SchedVar<IsCPSRDefinedPred,              [A57Write_2cyc_1M]>,
242  SchedVar<IsPredicatedPred,               [A57Write_2cyc_1I]>,
243  SchedVar<NoSchedPred,                    [A57Write_1cyc_1I]>
244]>;
245def : InstRW<[A57WriteMOVsr], (instregex "MOV(CC)?sr", "MVNsr", "t2MVNs",
246  "ASRr", "(t2|t)ASRrr", "LSRr", "(t2|t)LSRrr", "LSLr", "(t2|t)?LSLrr", "RORr",
247  "(t2|t)RORrr")>;
248
249// Move, top
250// MOVT - A57Write_2cyc_1M for r0px, A57Write_1cyc_1I for r1p0 and later
251def A57WriteMOVT : SchedWriteVariant<[
252  SchedVar<IsR1P0AndLaterPred,             [A57Write_1cyc_1I]>,
253  SchedVar<NoSchedPred,                    [A57Write_2cyc_1M]>
254]>;
255def : InstRW<[A57WriteMOVT], (instregex "MOVTi16")>;
256
257def A57WriteI2pc :
258  WriteSequence<[A57Write_1cyc_1I, A57Write_1cyc_1I, A57Write_1cyc_1I]>;
259def A57WriteI2ld :
260  WriteSequence<[A57Write_1cyc_1I, A57Write_1cyc_1I, A57Write_4cyc_1L]>;
261def : InstRW< [A57WriteI2pc], (instregex "MOV_ga_pcrel")>;
262def : InstRW< [A57WriteI2ld], (instregex "MOV_ga_pcrel_ldr")>;
263
264// +2cyc for branch forms
265def : InstRW<[A57Write_3cyc_1I], (instregex "MOVPC(LR|RX)")>;
266
267// --- 3.5 Divide and Multiply Instructions ---
268// Divide: SDIV, UDIV
269// latency from documentration: 4 ­‐ 20, maximum taken
270def : SchedAlias<WriteDIV, A57Write_20cyc_1M>;
271// Multiply: tMul not bound to common WriteRes types
272def : InstRW<[A57Write_3cyc_1M], (instregex "tMUL")>;
273def : SchedAlias<WriteMUL16, A57Write_3cyc_1M>;
274def : SchedAlias<WriteMUL32, A57Write_3cyc_1M>;
275def : ReadAdvance<ReadMUL, 0>;
276
277// Multiply accumulate: MLA, MLS, SMLABB, SMLABT, SMLATB, SMLATT, SMLAWB,
278// SMLAWT, SMLAD{X}, SMLSD{X}, SMMLA{R}, SMMLS{R}
279// Multiply-accumulate pipelines support late-forwarding of accumulate operands
280// from similar μops, allowing a typical sequence of multiply-accumulate μops
281// to issue one every 1 cycle (sched advance = 2).
282def A57WriteMLA : SchedWriteRes<[A57UnitM]> { let Latency = 3; }
283def A57WriteMLAL : SchedWriteRes<[A57UnitM]> { let Latency = 4; }
284def A57ReadMLA  : SchedReadAdvance<2, [A57WriteMLA, A57WriteMLAL]>;
285
286def : InstRW<[A57WriteMLA],
287  (instregex "t2SMLAD", "t2SMLADX", "t2SMLSD", "t2SMLSDX")>;
288
289def : SchedAlias<WriteMAC16, A57WriteMLA>;
290def : SchedAlias<WriteMAC32, A57WriteMLA>;
291def : SchedAlias<ReadMAC,    A57ReadMLA>;
292
293def : SchedAlias<WriteMAC64Lo, A57WriteMLAL>;
294def : SchedAlias<WriteMAC64Hi, A57WriteMLAL>;
295
296// Multiply long: SMULL, UMULL
297def : SchedAlias<WriteMUL64Lo, A57Write_4cyc_1M>;
298def : SchedAlias<WriteMUL64Hi, A57Write_4cyc_1M>;
299
300// --- 3.6 Saturating and Parallel Arithmetic Instructions ---
301// Parallel	arith
302// SADD16, SADD8, SSUB16, SSUB8, UADD16, UADD8, USUB16, USUB8
303// Conditional GE-setting instructions require three extra μops
304// and two additional cycles to conditionally update the GE field.
305def A57WriteParArith : SchedWriteVariant<[
306  SchedVar<IsPredicatedPred, [A57Write_4cyc_1I_1M]>,
307  SchedVar<NoSchedPred,      [A57Write_2cyc_1I_1M]>
308]>;
309def : InstRW< [A57WriteParArith], (instregex
310  "(t2)?SADD(16|8)", "(t2)?SSUB(16|8)",
311  "(t2)?UADD(16|8)", "(t2)?USUB(16|8)")>;
312
313// Parallel	arith with exchange: SASX, SSAX, UASX, USAX
314def A57WriteParArithExch : SchedWriteVariant<[
315  SchedVar<IsPredicatedPred, [A57Write_5cyc_1I_1M]>,
316  SchedVar<NoSchedPred,      [A57Write_3cyc_1I_1M]>
317]>;
318def : InstRW<[A57WriteParArithExch],
319  (instregex "(t2)?SASX", "(t2)?SSAX", "(t2)?UASX", "(t2)?USAX")>;
320
321// Parallel	halving	arith
322// SHADD16, SHADD8, SHSUB16, SHSUB8, UHADD16, UHADD8, UHSUB16,	UHSUB8
323def : InstRW<[A57Write_2cyc_1M], (instregex
324  "(t2)?SHADD(16|8)", "(t2)?SHSUB(16|8)",
325  "(t2)?UHADD(16|8)", "(t2)?UHSUB(16|8)")>;
326
327// Parallel halving arith with exchange
328// SHASX, SHSAX, UHASX, UHSAX
329def : InstRW<[A57Write_3cyc_1I_1M], (instregex "(t2)?SHASX", "(t2)?SHSAX",
330  "(t2)?UHASX", "(t2)?UHSAX")>;
331
332// Parallel	saturating arith
333// QADD16, QADD8, QSUB16, QSUB8, UQADD16, UQADD8, UQSUB16, UQSUB8
334def : InstRW<[A57Write_2cyc_1M], (instregex "QADD(16|8)", "QSUB(16|8)",
335  "UQADD(16|8)", "UQSUB(16|8)", "t2(U?)QADD", "t2(U?)QSUB")>;
336
337// Parallel	saturating arith with exchange
338// QASX, QSAX, UQASX, UQSAX
339def : InstRW<[A57Write_3cyc_1I_1M], (instregex "(t2)?QASX", "(t2)?QSAX",
340  "(t2)?UQASX", "(t2)?UQSAX")>;
341
342// Saturate: SSAT, SSAT16, USAT, USAT16
343def : InstRW<[A57Write_2cyc_1M],
344  (instregex "(t2)?SSAT(16)?", "(t2)?USAT(16)?")>;
345
346// Saturating arith: QADD, QSUB
347def : InstRW<[A57Write_2cyc_1M], (instregex "QADD$", "QSUB$")>;
348
349// Saturating doubling arith: QDADD, QDSUB
350def : InstRW<[A57Write_3cyc_1I_1M], (instregex "(t2)?QDADD", "(t2)?QDSUB")>;
351
352// --- 3.7 Miscellaneous Data-Processing Instructions ---
353// Bit field extract: SBFX, UBFX
354def : InstRW<[A57Write_1cyc_1I], (instregex "(t2)?SBFX", "(t2)?UBFX")>;
355
356// Bit field insert/clear: BFI, BFC
357def : InstRW<[A57Write_2cyc_1M], (instregex "(t2)?BFI", "(t2)?BFC")>;
358
359// Select bytes, conditional/unconditional
360def A57WriteSEL : SchedWriteVariant<[
361  SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
362  SchedVar<NoSchedPred,      [A57Write_1cyc_1I]>
363]>;
364def : InstRW<[A57WriteSEL], (instregex "(t2)?SEL")>;
365
366// Sign/zero extend, normal: SXTB, SXTH, UXTB, UXTH
367def : InstRW<[A57Write_1cyc_1I],
368  (instregex "(t2|t)?SXT(B|H)$", "(t2|t)?UXT(B|H)$")>;
369
370// Sign/zero extend and add, normal: SXTAB, SXTAH, UXTAB, UXTAH
371def : InstRW<[A57Write_2cyc_1M],
372  (instregex "(t2)?SXTA(B|H)$", "(t2)?UXTA(B|H)$")>;
373
374// Sign/zero extend and add, parallel: SXTAB16, UXTAB16
375def : InstRW<[A57Write_4cyc_1M], (instregex "(t2)?SXTAB16", "(t2)?UXTAB16")>;
376
377// Sum of absolute differences: USAD8, USADA8
378def : InstRW<[A57Write_3cyc_1M], (instregex "(t2)?USAD8", "(t2)?USADA8")>;
379
380// --- 3.8 Load Instructions ---
381
382// Load, immed offset
383// LDR and LDRB have LDRi12 and LDRBi12 forms for immediate
384def : InstRW<[A57Write_4cyc_1L], (instregex "LDRi12", "LDRBi12",
385  "LDRcp", "(t2|t)?LDRConstPool", "LDRLIT_ga_(pcrel|abs)",
386  "PICLDR", "tLDR")>;
387
388def : InstRW<[A57Write_4cyc_1L],
389  (instregex "t2LDRS?(B|H)?(pcrel|T|i8|i12|pci|pci_pic|s)?$")>;
390
391// For "Load, register offset, minus" we need +1cyc, +1I
392def A57WriteLdrAm3 : SchedWriteVariant<[
393  SchedVar<IsLdrAm3NegRegOffPred, [A57Write_5cyc_1I_1L]>,
394  SchedVar<NoSchedPred,           [A57Write_4cyc_1L]>
395]>;
396def : InstRW<[A57WriteLdrAm3], (instregex "LDR(H|SH|SB)$")>;
397def A57WriteLdrAm3X2 : SchedWriteVariant<[
398  SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_5cyc_1I_1L]>,
399  SchedVar<NoSchedPred,             [A57Write_4cyc_1L]>
400]>;
401def : InstRW<[A57WriteLdrAm3X2, A57WriteLdrAm3X2], (instregex "LDRD$")>;
402def : InstRW<[A57Write_4cyc_1L, A57Write_4cyc_1L], (instregex "t2LDRDi8")>;
403
404def A57WriteLdrAmLDSTSO : SchedWriteVariant<[
405  SchedVar<IsLdstsoScaledNotOptimalPred, [A57Write_5cyc_1I_1L]>,
406  SchedVar<IsLdstsoMinusRegPred,         [A57Write_5cyc_1I_1L]>,
407  SchedVar<NoSchedPred,                  [A57Write_4cyc_1L]>
408]>;
409def : InstRW<[A57WriteLdrAmLDSTSO], (instregex "LDRrs", "LDRBrs")>;
410
411def A57WrBackOne : SchedWriteRes<[]> {
412  let Latency = 1;
413  let NumMicroOps = 0;
414}
415def A57WrBackTwo : SchedWriteRes<[]> {
416  let Latency = 2;
417  let NumMicroOps = 0;
418}
419def A57WrBackThree : SchedWriteRes<[]> {
420  let Latency = 3;
421  let NumMicroOps = 0;
422}
423
424// --- LDR pre-indexed ---
425// Load, immed pre-indexed (4 cyc for load result, 1 cyc for Base update)
426def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackOne], (instregex "LDR_PRE_IMM",
427  "LDRB_PRE_IMM", "t2LDRB_PRE")>;
428
429// Load, register pre-indexed (4 cyc for load result, 2 cyc for Base update)
430// (5 cyc load result for not-lsl2 scaled)
431def A57WriteLdrAmLDSTSOPre : SchedWriteVariant<[
432  SchedVar<IsLdstsoScaledNotOptimalPredX2, [A57Write_5cyc_1I_1L]>,
433  SchedVar<NoSchedPred,                    [A57Write_4cyc_1L_1I]>
434]>;
435def : InstRW<[A57WriteLdrAmLDSTSOPre, A57WrBackTwo],
436  (instregex "LDR_PRE_REG", "LDRB_PRE_REG")>;
437
438def A57WriteLdrAm3PreWrBack : SchedWriteVariant<[
439  SchedVar<IsLdrAm3RegOffPredX2, [A57WrBackTwo]>,
440  SchedVar<NoSchedPred,          [A57WrBackOne]>
441]>;
442def : InstRW<[A57Write_4cyc_1L, A57WriteLdrAm3PreWrBack],
443  (instregex "LDR(H|SH|SB)_PRE")>;
444def : InstRW<[A57Write_4cyc_1L, A57WrBackOne],
445  (instregex "t2LDR(H|SH|SB)?_PRE")>;
446
447// LDRD pre-indexed: 5(2) cyc for reg, 4(1) cyc for imm.
448def A57WriteLdrDAm3Pre : SchedWriteVariant<[
449  SchedVar<IsLdrAm3RegOffPredX3, [A57Write_5cyc_1I_1L]>,
450  SchedVar<NoSchedPred,          [A57Write_4cyc_1L_1I]>
451]>;
452def A57WriteLdrDAm3PreWrBack : SchedWriteVariant<[
453  SchedVar<IsLdrAm3RegOffPredX3, [A57WrBackTwo]>,
454  SchedVar<NoSchedPred,          [A57WrBackOne]>
455]>;
456def : InstRW<[A57WriteLdrDAm3Pre, A57WriteLdrDAm3Pre, A57WriteLdrDAm3PreWrBack],
457  (instregex "LDRD_PRE")>;
458def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I, A57WrBackOne],
459  (instregex "t2LDRD_PRE")>;
460
461// --- LDR post-indexed ---
462def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackOne], (instregex "LDR(T?)_POST_IMM",
463  "LDRB(T?)_POST_IMM", "LDR(SB|H|SH)Ti", "t2LDRB_POST")>;
464
465def A57WriteLdrAm3PostWrBack : SchedWriteVariant<[
466  SchedVar<IsLdrAm3RegOffPred, [A57WrBackTwo]>,
467  SchedVar<NoSchedPred,        [A57WrBackOne]>
468]>;
469def : InstRW<[A57Write_4cyc_1L_1I, A57WriteLdrAm3PostWrBack],
470  (instregex "LDR(H|SH|SB)_POST")>;
471def : InstRW<[A57Write_4cyc_1L, A57WrBackOne],
472  (instregex "t2LDR(H|SH|SB)?_POST")>;
473
474def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackTwo], (instregex "LDR_POST_REG",
475  "LDRB_POST_REG", "LDR(B?)T_POST$")>;
476
477def A57WriteLdrTRegPost : SchedWriteVariant<[
478  SchedVar<IsLdrAm2ScaledPred, [A57Write_4cyc_1I_1L_1M]>,
479  SchedVar<NoSchedPred,        [A57Write_4cyc_1L_1I]>
480]>;
481def A57WriteLdrTRegPostWrBack : SchedWriteVariant<[
482  SchedVar<IsLdrAm2ScaledPred, [A57WrBackThree]>,
483  SchedVar<NoSchedPred,        [A57WrBackTwo]>
484]>;
485// 4(3) "I0/I1,L,M" for scaled register, otherwise 4(2) "I0/I1,L"
486def : InstRW<[A57WriteLdrTRegPost, A57WriteLdrTRegPostWrBack],
487  (instregex "LDRT_POST_REG", "LDRBT_POST_REG")>;
488
489def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackTwo], (instregex "LDR(SB|H|SH)Tr")>;
490
491def A57WriteLdrAm3PostWrBackX3 : SchedWriteVariant<[
492  SchedVar<IsLdrAm3RegOffPredX3, [A57WrBackTwo]>,
493  SchedVar<NoSchedPred,          [A57WrBackOne]>
494]>;
495// LDRD post-indexed: 4(2) cyc for reg, 4(1) cyc for imm.
496def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I,
497  A57WriteLdrAm3PostWrBackX3], (instregex "LDRD_POST")>;
498def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I, A57WrBackOne],
499  (instregex "t2LDRD_POST")>;
500
501// --- Preload instructions ---
502// Preload, immed offset
503def : InstRW<[A57Write_4cyc_1L], (instregex "(t2)?PLDi12", "(t2)?PLDWi12",
504  "t2PLDW?(i8|pci|s)", "(t2)?PLI")>;
505
506// Preload, register offset,
507// 5cyc "I0/I1,L" for minus reg or scaled not plus lsl2
508// otherwise 4cyc "L"
509def A57WritePLD : SchedWriteVariant<[
510  SchedVar<IsLdstsoScaledNotOptimalPredX0, [A57Write_5cyc_1I_1L]>,
511  SchedVar<IsLdstsoMinusRegPredX0,         [A57Write_5cyc_1I_1L]>,
512  SchedVar<NoSchedPred,                    [A57Write_4cyc_1L]>
513]>;
514def : InstRW<[A57WritePLD], (instregex "PLDrs", "PLDWrs")>;
515
516// --- Load multiple instructions ---
517foreach NumAddr = 1-8 in {
518  def A57LMAddrPred#NumAddr :
519    SchedPredicate<"(TII->getLDMVariableDefsSize(*MI)+1)/2 == "#NumAddr>;
520}
521
522def A57LDMOpsListNoregin : A57WriteLMOpsListType<
523                [A57Write_3cyc_1L, A57Write_3cyc_1L,
524                 A57Write_4cyc_1L, A57Write_4cyc_1L,
525                 A57Write_5cyc_1L, A57Write_5cyc_1L,
526                 A57Write_6cyc_1L, A57Write_6cyc_1L,
527                 A57Write_7cyc_1L, A57Write_7cyc_1L,
528                 A57Write_8cyc_1L, A57Write_8cyc_1L,
529                 A57Write_9cyc_1L, A57Write_9cyc_1L,
530                 A57Write_10cyc_1L, A57Write_10cyc_1L]>;
531def A57WriteLDMnoreginlist : SchedWriteVariant<[
532  SchedVar<A57LMAddrPred1,     A57LDMOpsListNoregin.Writes[0-1]>,
533  SchedVar<A57LMAddrPred2,     A57LDMOpsListNoregin.Writes[0-3]>,
534  SchedVar<A57LMAddrPred3,     A57LDMOpsListNoregin.Writes[0-5]>,
535  SchedVar<A57LMAddrPred4,     A57LDMOpsListNoregin.Writes[0-7]>,
536  SchedVar<A57LMAddrPred5,     A57LDMOpsListNoregin.Writes[0-9]>,
537  SchedVar<A57LMAddrPred6,     A57LDMOpsListNoregin.Writes[0-11]>,
538  SchedVar<A57LMAddrPred7,     A57LDMOpsListNoregin.Writes[0-13]>,
539  SchedVar<A57LMAddrPred8,     A57LDMOpsListNoregin.Writes[0-15]>,
540  SchedVar<NoSchedPred,        A57LDMOpsListNoregin.Writes[0-15]>
541]> { let Variadic=1; }
542
543def A57LDMOpsListRegin : A57WriteLMOpsListType<
544                [A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I,
545                 A57Write_5cyc_1L_1I, A57Write_5cyc_1L_1I,
546                 A57Write_6cyc_1L_1I, A57Write_6cyc_1L_1I,
547                 A57Write_7cyc_1L_1I, A57Write_7cyc_1L_1I,
548                 A57Write_8cyc_1L_1I, A57Write_8cyc_1L_1I,
549                 A57Write_9cyc_1L_1I, A57Write_9cyc_1L_1I,
550                 A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I,
551                 A57Write_11cyc_1L_1I, A57Write_11cyc_1L_1I]>;
552def A57WriteLDMreginlist : SchedWriteVariant<[
553  SchedVar<A57LMAddrPred1,     A57LDMOpsListRegin.Writes[0-1]>,
554  SchedVar<A57LMAddrPred2,     A57LDMOpsListRegin.Writes[0-3]>,
555  SchedVar<A57LMAddrPred3,     A57LDMOpsListRegin.Writes[0-5]>,
556  SchedVar<A57LMAddrPred4,     A57LDMOpsListRegin.Writes[0-7]>,
557  SchedVar<A57LMAddrPred5,     A57LDMOpsListRegin.Writes[0-9]>,
558  SchedVar<A57LMAddrPred6,     A57LDMOpsListRegin.Writes[0-11]>,
559  SchedVar<A57LMAddrPred7,     A57LDMOpsListRegin.Writes[0-13]>,
560  SchedVar<A57LMAddrPred8,     A57LDMOpsListRegin.Writes[0-15]>,
561  SchedVar<NoSchedPred,        A57LDMOpsListRegin.Writes[0-15]>
562]> { let Variadic=1; }
563
564def A57LDMOpsList_Upd : A57WriteLMOpsListType<
565              [A57WrBackOne,
566               A57Write_3cyc_1L_1I, A57Write_3cyc_1L_1I,
567               A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I,
568               A57Write_5cyc_1L_1I, A57Write_5cyc_1L_1I,
569               A57Write_6cyc_1L_1I, A57Write_6cyc_1L_1I,
570               A57Write_7cyc_1L_1I, A57Write_7cyc_1L_1I,
571               A57Write_8cyc_1L_1I, A57Write_8cyc_1L_1I,
572               A57Write_9cyc_1L_1I, A57Write_9cyc_1L_1I,
573               A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I]>;
574def A57WriteLDM_Upd : SchedWriteVariant<[
575  SchedVar<A57LMAddrPred1,     A57LDMOpsList_Upd.Writes[0-2]>,
576  SchedVar<A57LMAddrPred2,     A57LDMOpsList_Upd.Writes[0-4]>,
577  SchedVar<A57LMAddrPred3,     A57LDMOpsList_Upd.Writes[0-6]>,
578  SchedVar<A57LMAddrPred4,     A57LDMOpsList_Upd.Writes[0-8]>,
579  SchedVar<A57LMAddrPred5,     A57LDMOpsList_Upd.Writes[0-10]>,
580  SchedVar<A57LMAddrPred6,     A57LDMOpsList_Upd.Writes[0-12]>,
581  SchedVar<A57LMAddrPred7,     A57LDMOpsList_Upd.Writes[0-14]>,
582  SchedVar<A57LMAddrPred8,     A57LDMOpsList_Upd.Writes[0-16]>,
583  SchedVar<NoSchedPred,        A57LDMOpsList_Upd.Writes[0-16]>
584]> { let Variadic=1; }
585
586def A57WriteLDM : SchedWriteVariant<[
587  SchedVar<IsLdmBaseRegInList, [A57WriteLDMreginlist]>,
588  SchedVar<NoSchedPred,        [A57WriteLDMnoreginlist]>
589]> { let Variadic=1; }
590
591def : InstRW<[A57WriteLDM], (instregex "(t|t2|sys)?LDM(IA|DA|DB|IB)$")>;
592
593// TODO: no writeback latency defined in documentation (implemented as 1 cyc)
594def : InstRW<[A57WriteLDM_Upd],
595  (instregex "(t|t2|sys)?LDM(IA_UPD|DA_UPD|DB_UPD|IB_UPD|IA_RET)", "tPOP")>;
596
597def : InstRW<[A57Write_5cyc_1L], (instregex "VLLDM")>;
598
599// --- 3.9 Store Instructions ---
600
601// Store, immed offset
602def : InstRW<[A57Write_1cyc_1S], (instregex "STRi12", "STRBi12", "PICSTR",
603  "t2STR(B?)(T|i12|i8|s)", "t2STRDi8", "t2STRH(i12|i8|s)", "tSTR")>;
604
605// Store, register offset
606// For minus or for not plus lsl2 scaled we need 3cyc "I0/I1, S",
607// otherwise 1cyc S.
608def A57WriteStrAmLDSTSO : SchedWriteVariant<[
609  SchedVar<IsLdstsoScaledNotOptimalPred, [A57Write_3cyc_1I_1S]>,
610  SchedVar<IsLdstsoMinusRegPred,         [A57Write_3cyc_1I_1S]>,
611  SchedVar<NoSchedPred,                  [A57Write_1cyc_1S]>
612]>;
613def : InstRW<[A57WriteStrAmLDSTSO], (instregex "STRrs", "STRBrs")>;
614
615// STRH,STRD: 3cyc "I0/I1, S" for minus reg, 1cyc S for imm or for plus reg.
616def A57WriteStrAm3 : SchedWriteVariant<[
617  SchedVar<IsLdrAm3NegRegOffPred, [A57Write_3cyc_1I_1S]>,
618  SchedVar<NoSchedPred,           [A57Write_1cyc_1S]>
619]>;
620def : InstRW<[A57WriteStrAm3], (instregex "STRH$")>;
621def A57WriteStrAm3X2 : SchedWriteVariant<[
622  SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_3cyc_1I_1S]>,
623  SchedVar<NoSchedPred,             [A57Write_1cyc_1S]>
624]>;
625def : InstRW<[A57WriteStrAm3X2], (instregex "STRD$")>;
626
627// Store, immed pre-indexed (1cyc "S, I0/I1", 1cyc writeback)
628def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I], (instregex "STR_PRE_IMM",
629  "STRB_PRE_IMM", "STR(B)?(r|i)_preidx", "(t2)?STRH_(preidx|PRE)",
630  "t2STR(B?)_(PRE|preidx)", "t2STRD_PRE")>;
631
632// Store, register pre-indexed:
633// 1(1) "S, I0/I1" for plus reg
634// 3(2) "I0/I1, S" for minus reg
635// 1(2) "S, M" for scaled plus lsl2
636// 3(2) "I0/I1, S" for other scaled
637def A57WriteStrAmLDSTSOPre : SchedWriteVariant<[
638  SchedVar<IsLdstsoScaledNotOptimalPredX2, [A57Write_3cyc_1I_1S]>,
639  SchedVar<IsLdstsoMinusRegPredX2,         [A57Write_3cyc_1I_1S]>,
640  SchedVar<IsLdstsoScaledPredX2,           [A57Write_1cyc_1S_1M]>,
641  SchedVar<NoSchedPred,                    [A57Write_1cyc_1S_1I]>
642]>;
643def A57WriteStrAmLDSTSOPreWrBack : SchedWriteVariant<[
644  SchedVar<IsLdstsoScaledPredX2,           [A57WrBackTwo]>,
645  SchedVar<IsLdstsoMinusRegPredX2,         [A57WrBackTwo]>,
646  SchedVar<NoSchedPred,                    [A57WrBackOne]>
647]>;
648def : InstRW<[A57WriteStrAmLDSTSOPreWrBack, A57WriteStrAmLDSTSOPre],
649  (instregex "STR_PRE_REG", "STRB_PRE_REG")>;
650
651// pre-indexed STRH/STRD (STRH_PRE, STRD_PRE)
652// 1(1) "S, I0/I1" for imm or reg plus
653// 3(2) "I0/I1, S" for reg minus
654def A57WriteStrAm3PreX2 : SchedWriteVariant<[
655  SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_3cyc_1I_1S]>,
656  SchedVar<NoSchedPred,             [A57Write_1cyc_1S_1I]>
657]>;
658def A57WriteStrAm3PreWrBackX2 : SchedWriteVariant<[
659  SchedVar<IsLdrAm3NegRegOffPredX2, [A57WrBackTwo]>,
660  SchedVar<NoSchedPred,             [A57WrBackOne]>
661]>;
662def : InstRW<[A57WriteStrAm3PreWrBackX2, A57WriteStrAm3PreX2],
663  (instregex "STRH_PRE")>;
664
665def A57WriteStrAm3PreX3 : SchedWriteVariant<[
666  SchedVar<IsLdrAm3NegRegOffPredX3, [A57Write_3cyc_1I_1S]>,
667  SchedVar<NoSchedPred,             [A57Write_1cyc_1S_1I]>
668]>;
669def A57WriteStrAm3PreWrBackX3 : SchedWriteVariant<[
670  SchedVar<IsLdrAm3NegRegOffPredX3, [A57WrBackTwo]>,
671  SchedVar<NoSchedPred,             [A57WrBackOne]>
672]>;
673def : InstRW<[A57WriteStrAm3PreWrBackX3, A57WriteStrAm3PreX3],
674  (instregex "STRD_PRE")>;
675
676def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I], (instregex "STR(T?)_POST_IMM",
677  "STRB(T?)_POST_IMM", "t2STR(B?)_POST")>;
678
679// 1(2) "S, M" for STR/STRB register post-indexed (both scaled or not)
680def : InstRW<[A57WrBackTwo, A57Write_1cyc_1S_1M], (instregex "STR(T?)_POST_REG",
681  "STRB(T?)_POST_REG", "STR(B?)T_POST$")>;
682
683// post-indexed STRH/STRD(STRH_POST, STRD_POST), STRHTi, STRHTr
684// 1(1) "S, I0/I1" both for reg or imm
685def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I],
686  (instregex "(t2)?STR(H|D)_POST", "STRHT(i|r)", "t2STRHT")>;
687
688// --- Store multiple instructions ---
689// TODO: no writeback latency defined in documentation
690def A57WriteSTM : SchedWriteVariant<[
691    SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S]>,
692    SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S]>,
693    SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S]>,
694    SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S]>,
695    SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S]>,
696    SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S]>,
697    SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S]>,
698    SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S]>,
699    SchedVar<NoSchedPred,    [A57Write_2cyc_1S]>
700]>;
701def A57WriteSTM_Upd : SchedWriteVariant<[
702    SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S_1I]>,
703    SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S_1I]>,
704    SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S_1I]>,
705    SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S_1I]>,
706    SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S_1I]>,
707    SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S_1I]>,
708    SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S_1I]>,
709    SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S_1I]>,
710    SchedVar<NoSchedPred,    [A57Write_2cyc_1S_1I]>
711]>;
712
713def : InstRW<[A57WriteSTM], (instregex "(t2|sys|t)?STM(IA|DA|DB|IB)$")>;
714def : InstRW<[A57WrBackOne, A57WriteSTM_Upd],
715  (instregex "(t2|sys|t)?STM(IA_UPD|DA_UPD|DB_UPD|IB_UPD)", "tPUSH")>;
716
717def : InstRW<[A57Write_5cyc_1S], (instregex "VLSTM")>;
718
719// --- 3.10 FP Data Processing Instructions ---
720def : SchedAlias<WriteFPALU32, A57Write_5cyc_1V>;
721def : SchedAlias<WriteFPALU64, A57Write_5cyc_1V>;
722
723def : InstRW<[A57Write_3cyc_1V], (instregex "VABS(S|D|H)")>;
724
725// fp compare - 3cyc F1 for unconditional, 6cyc "F0/F1, F1" for conditional
726def A57WriteVcmp : SchedWriteVariant<[
727  SchedVar<IsPredicatedPred, [A57Write_6cyc_1V_1X]>,
728  SchedVar<NoSchedPred,      [A57Write_3cyc_1X]>
729]>;
730def : InstRW<[A57WriteVcmp],
731  (instregex "VCMP(D|S|H|ZD|ZS|ZH)$", "VCMPE(D|S|H|ZD|ZS|ZH)")>;
732
733// fp convert
734def : InstRW<[A57Write_5cyc_1V], (instregex
735  "VCVT(A|N|P|M)(SH|UH|SS|US|SD|UD)", "VCVT(BDH|THD|TDH)")>;
736def : InstRW<[A57Write_5cyc_1V], (instregex "VTOSLS", "VTOUHS", "VTOULS")>;
737def : SchedAlias<WriteFPCVT, A57Write_5cyc_1V>;
738
739def : InstRW<[A57Write_5cyc_1V], (instregex "VJCVT")>;
740
741// FP round to integral
742def : InstRW<[A57Write_5cyc_1V], (instregex "VRINT(A|N|P|M|Z|R|X)(H|S|D)$")>;
743
744// FP divide, FP square root
745def : SchedAlias<WriteFPDIV32, A57Write_17cyc_1W>;
746def : SchedAlias<WriteFPDIV64, A57Write_32cyc_1W>;
747def : SchedAlias<WriteFPSQRT32, A57Write_17cyc_1W>;
748def : SchedAlias<WriteFPSQRT64, A57Write_32cyc_1W>;
749
750def : InstRW<[A57Write_17cyc_1W], (instregex "VSQRTH")>;
751
752// FP max/min
753def : InstRW<[A57Write_5cyc_1V], (instregex "VMAX", "VMIN")>;
754
755// FP multiply-accumulate pipelines support late forwarding of the result
756// from FP multiply μops to the accumulate operands of an
757// FP multiply-accumulate μop. The latter can potentially be issued 1 cycle
758// after the FP multiply μop has been issued
759// FP multiply, FZ
760def A57WriteVMUL : SchedWriteRes<[A57UnitV]> { let Latency = 5; }
761
762def : SchedAlias<WriteFPMUL32, A57WriteVMUL>;
763def : SchedAlias<WriteFPMUL64, A57WriteVMUL>;
764def : ReadAdvance<ReadFPMUL, 0>;
765
766// FP multiply accumulate, FZ: 9cyc "F0/F1" or 4 cyc for sequenced accumulate
767// VFMA, VFMS, VFNMA, VFNMS, VMLA, VMLS, VNMLA, VNMLS
768def A57WriteVFMA : SchedWriteRes<[A57UnitV]> { let Latency = 9;  }
769
770// VFMA takes 9 cyc for common case and 4 cyc for VFMA->VFMA chain (5 read adv.)
771// VMUL takes 5 cyc for common case and 1 cyc for VMUL->VFMA chain (4 read adv.)
772// Currently, there is no way to define different read advances for VFMA operand
773// from VFMA or from VMUL, so there will be 5 read advance.
774// Zero latency (instead of one) for VMUL->VFMA shouldn't break something.
775// The same situation with ASIMD VMUL/VFMA instructions
776// def A57ReadVFMA : SchedRead;
777// def : ReadAdvance<A57ReadVFMA, 5, [A57WriteVFMA]>;
778// def : ReadAdvance<A57ReadVFMA, 4, [A57WriteVMUL]>;
779def A57ReadVFMA5 : SchedReadAdvance<5, [A57WriteVFMA, A57WriteVMUL]>;
780
781def : SchedAlias<WriteFPMAC32, A57WriteVFMA>;
782def : SchedAlias<WriteFPMAC64, A57WriteVFMA>;
783def : SchedAlias<ReadFPMAC, A57ReadVFMA5>;
784
785// VMLAH/VMLSH are not binded to scheduling classes by default, so here custom:
786def : InstRW<[A57WriteVFMA, A57ReadVFMA5, ReadFPMUL, ReadFPMUL],
787  (instregex "VMLAH", "VMLSH", "VNMLAH", "VNMLSH")>;
788
789def : InstRW<[A57WriteVMUL],
790  (instregex "VUDOTD", "VSDOTD", "VUDOTQ", "VSDOTQ")>;
791
792def : InstRW<[A57Write_3cyc_1V], (instregex "VNEG")>;
793def : InstRW<[A57Write_3cyc_1V], (instregex "VSEL")>;
794
795// --- 3.11 FP Miscellaneous Instructions ---
796// VMOV: 3cyc "F0/F1" for imm/reg
797def : InstRW<[A57Write_3cyc_1V], (instregex "FCONST(D|S|H)")>;
798def : InstRW<[A57Write_3cyc_1V], (instregex "VMOV(D|S|H)(cc)?$")>;
799
800def : InstRW<[A57Write_3cyc_1V], (instregex "VINSH")>;
801
802// 5cyc L for FP transfer, vfp to core reg,
803// 5cyc L for FP transfer, core reg to vfp
804def : SchedAlias<WriteFPMOV, A57Write_5cyc_1L>;
805// VMOVRRS/VMOVRRD in common code declared with one WriteFPMOV (instead of 2).
806def : InstRW<[A57Write_5cyc_1L, A57Write_5cyc_1L], (instregex "VMOV(RRS|RRD)")>;
807
808// 8cyc "L,F0/F1" for FP transfer, core reg to upper or lower half of vfp D-reg
809def : InstRW<[A57Write_8cyc_1L_1I], (instregex "VMOVDRR")>;
810
811// --- 3.12 FP Load Instructions ---
812def : InstRW<[A57Write_5cyc_1L], (instregex "VLDR(D|S|H)")>;
813
814def : InstRW<[A57Write_5cyc_1L], (instregex "VLDMQIA$")>;
815
816// FP load multiple (VLDM)
817
818def A57VLDMOpsListUncond : A57WriteLMOpsListType<
819               [A57Write_5cyc_1L, A57Write_5cyc_1L,
820                A57Write_6cyc_1L, A57Write_6cyc_1L,
821                A57Write_7cyc_1L, A57Write_7cyc_1L,
822                A57Write_8cyc_1L, A57Write_8cyc_1L,
823                A57Write_9cyc_1L, A57Write_9cyc_1L,
824                A57Write_10cyc_1L, A57Write_10cyc_1L,
825                A57Write_11cyc_1L, A57Write_11cyc_1L,
826                A57Write_12cyc_1L, A57Write_12cyc_1L]>;
827def A57WriteVLDMuncond : SchedWriteVariant<[
828  SchedVar<A57LMAddrPred1,  A57VLDMOpsListUncond.Writes[0-1]>,
829  SchedVar<A57LMAddrPred2,  A57VLDMOpsListUncond.Writes[0-3]>,
830  SchedVar<A57LMAddrPred3,  A57VLDMOpsListUncond.Writes[0-5]>,
831  SchedVar<A57LMAddrPred4,  A57VLDMOpsListUncond.Writes[0-7]>,
832  SchedVar<A57LMAddrPred5,  A57VLDMOpsListUncond.Writes[0-9]>,
833  SchedVar<A57LMAddrPred6,  A57VLDMOpsListUncond.Writes[0-11]>,
834  SchedVar<A57LMAddrPred7,  A57VLDMOpsListUncond.Writes[0-13]>,
835  SchedVar<A57LMAddrPred8,  A57VLDMOpsListUncond.Writes[0-15]>,
836  SchedVar<NoSchedPred,     A57VLDMOpsListUncond.Writes[0-15]>
837]> { let Variadic=1; }
838
839def A57VLDMOpsListCond : A57WriteLMOpsListType<
840               [A57Write_5cyc_1L, A57Write_6cyc_1L,
841                A57Write_7cyc_1L, A57Write_8cyc_1L,
842                A57Write_9cyc_1L, A57Write_10cyc_1L,
843                A57Write_11cyc_1L, A57Write_12cyc_1L,
844                A57Write_13cyc_1L, A57Write_14cyc_1L,
845                A57Write_15cyc_1L, A57Write_16cyc_1L,
846                A57Write_17cyc_1L, A57Write_18cyc_1L,
847                A57Write_19cyc_1L, A57Write_20cyc_1L]>;
848def A57WriteVLDMcond : SchedWriteVariant<[
849  SchedVar<A57LMAddrPred1,  A57VLDMOpsListCond.Writes[0-1]>,
850  SchedVar<A57LMAddrPred2,  A57VLDMOpsListCond.Writes[0-3]>,
851  SchedVar<A57LMAddrPred3,  A57VLDMOpsListCond.Writes[0-5]>,
852  SchedVar<A57LMAddrPred4,  A57VLDMOpsListCond.Writes[0-7]>,
853  SchedVar<A57LMAddrPred5,  A57VLDMOpsListCond.Writes[0-9]>,
854  SchedVar<A57LMAddrPred6,  A57VLDMOpsListCond.Writes[0-11]>,
855  SchedVar<A57LMAddrPred7,  A57VLDMOpsListCond.Writes[0-13]>,
856  SchedVar<A57LMAddrPred8,  A57VLDMOpsListCond.Writes[0-15]>,
857  SchedVar<NoSchedPred,     A57VLDMOpsListCond.Writes[0-15]>
858]> { let Variadic=1; }
859
860def A57WriteVLDM : SchedWriteVariant<[
861  SchedVar<IsPredicatedPred, [A57WriteVLDMcond]>,
862  SchedVar<NoSchedPred,      [A57WriteVLDMuncond]>
863]> { let Variadic=1; }
864
865def : InstRW<[A57WriteVLDM], (instregex "VLDM(DIA|SIA)$")>;
866
867def A57VLDMOpsListUncond_Upd : A57WriteLMOpsListType<
868               [A57Write_5cyc_1L_1I, A57Write_5cyc_1L_1I,
869                A57Write_6cyc_1L_1I, A57Write_6cyc_1L_1I,
870                A57Write_7cyc_1L_1I, A57Write_7cyc_1L_1I,
871                A57Write_8cyc_1L_1I, A57Write_8cyc_1L_1I,
872                A57Write_9cyc_1L_1I, A57Write_9cyc_1L_1I,
873                A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I,
874                A57Write_11cyc_1L_1I, A57Write_11cyc_1L_1I,
875                A57Write_12cyc_1L_1I, A57Write_12cyc_1L_1I]>;
876def A57WriteVLDMuncond_UPD : SchedWriteVariant<[
877  SchedVar<A57LMAddrPred1,  A57VLDMOpsListUncond_Upd.Writes[0-1]>,
878  SchedVar<A57LMAddrPred2,  A57VLDMOpsListUncond_Upd.Writes[0-3]>,
879  SchedVar<A57LMAddrPred3,  A57VLDMOpsListUncond_Upd.Writes[0-5]>,
880  SchedVar<A57LMAddrPred4,  A57VLDMOpsListUncond_Upd.Writes[0-7]>,
881  SchedVar<A57LMAddrPred5,  A57VLDMOpsListUncond_Upd.Writes[0-9]>,
882  SchedVar<A57LMAddrPred6,  A57VLDMOpsListUncond_Upd.Writes[0-11]>,
883  SchedVar<A57LMAddrPred7,  A57VLDMOpsListUncond_Upd.Writes[0-13]>,
884  SchedVar<A57LMAddrPred8,  A57VLDMOpsListUncond_Upd.Writes[0-15]>,
885  SchedVar<NoSchedPred,     A57VLDMOpsListUncond_Upd.Writes[0-15]>
886]> { let Variadic=1; }
887
888def A57VLDMOpsListCond_Upd : A57WriteLMOpsListType<
889               [A57Write_5cyc_1L_1I, A57Write_6cyc_1L_1I,
890                A57Write_7cyc_1L_1I, A57Write_8cyc_1L_1I,
891                A57Write_9cyc_1L_1I, A57Write_10cyc_1L_1I,
892                A57Write_11cyc_1L_1I, A57Write_12cyc_1L_1I,
893                A57Write_13cyc_1L_1I, A57Write_14cyc_1L_1I,
894                A57Write_15cyc_1L_1I, A57Write_16cyc_1L_1I,
895                A57Write_17cyc_1L_1I, A57Write_18cyc_1L_1I,
896                A57Write_19cyc_1L_1I, A57Write_20cyc_1L_1I]>;
897def A57WriteVLDMcond_UPD : SchedWriteVariant<[
898  SchedVar<A57LMAddrPred1,  A57VLDMOpsListCond_Upd.Writes[0-1]>,
899  SchedVar<A57LMAddrPred2,  A57VLDMOpsListCond_Upd.Writes[0-3]>,
900  SchedVar<A57LMAddrPred3,  A57VLDMOpsListCond_Upd.Writes[0-5]>,
901  SchedVar<A57LMAddrPred4,  A57VLDMOpsListCond_Upd.Writes[0-7]>,
902  SchedVar<A57LMAddrPred5,  A57VLDMOpsListCond_Upd.Writes[0-9]>,
903  SchedVar<A57LMAddrPred6,  A57VLDMOpsListCond_Upd.Writes[0-11]>,
904  SchedVar<A57LMAddrPred7,  A57VLDMOpsListCond_Upd.Writes[0-13]>,
905  SchedVar<A57LMAddrPred8,  A57VLDMOpsListCond_Upd.Writes[0-15]>,
906  SchedVar<NoSchedPred,     A57VLDMOpsListCond_Upd.Writes[0-15]>
907]> { let Variadic=1; }
908
909def A57WriteVLDM_UPD : SchedWriteVariant<[
910  SchedVar<IsPredicatedPred, [A57WriteVLDMcond_UPD]>,
911  SchedVar<NoSchedPred,      [A57WriteVLDMuncond_UPD]>
912]> { let Variadic=1; }
913
914def : InstRW<[A57WrBackOne, A57WriteVLDM_UPD],
915  (instregex "VLDM(DIA_UPD|DDB_UPD|SIA_UPD|SDB_UPD)")>;
916
917// --- 3.13 FP Store Instructions ---
918def : InstRW<[A57Write_1cyc_1S], (instregex "VSTR(D|S|H)")>;
919
920def : InstRW<[A57Write_2cyc_1S], (instregex "VSTMQIA$")>;
921
922def A57WriteVSTMs : SchedWriteVariant<[
923    SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S]>,
924    SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S]>,
925    SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S]>,
926    SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S]>,
927    SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S]>,
928    SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S]>,
929    SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S]>,
930    SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S]>,
931    SchedVar<NoSchedPred,    [A57Write_2cyc_1S]>
932]>;
933def A57WriteVSTMd : SchedWriteVariant<[
934    SchedVar<A57LMAddrPred1, [A57Write_2cyc_1S]>,
935    SchedVar<A57LMAddrPred2, [A57Write_4cyc_1S]>,
936    SchedVar<A57LMAddrPred3, [A57Write_6cyc_1S]>,
937    SchedVar<A57LMAddrPred4, [A57Write_8cyc_1S]>,
938    SchedVar<A57LMAddrPred5, [A57Write_10cyc_1S]>,
939    SchedVar<A57LMAddrPred6, [A57Write_12cyc_1S]>,
940    SchedVar<A57LMAddrPred7, [A57Write_14cyc_1S]>,
941    SchedVar<A57LMAddrPred8, [A57Write_16cyc_1S]>,
942    SchedVar<NoSchedPred,    [A57Write_4cyc_1S]>
943]>;
944def A57WriteVSTMs_Upd : SchedWriteVariant<[
945    SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S_1I]>,
946    SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S_1I]>,
947    SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S_1I]>,
948    SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S_1I]>,
949    SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S_1I]>,
950    SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S_1I]>,
951    SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S_1I]>,
952    SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S_1I]>,
953    SchedVar<NoSchedPred,    [A57Write_2cyc_1S_1I]>
954]>;
955def A57WriteVSTMd_Upd : SchedWriteVariant<[
956    SchedVar<A57LMAddrPred1, [A57Write_2cyc_1S_1I]>,
957    SchedVar<A57LMAddrPred2, [A57Write_4cyc_1S_1I]>,
958    SchedVar<A57LMAddrPred3, [A57Write_6cyc_1S_1I]>,
959    SchedVar<A57LMAddrPred4, [A57Write_8cyc_1S_1I]>,
960    SchedVar<A57LMAddrPred5, [A57Write_10cyc_1S_1I]>,
961    SchedVar<A57LMAddrPred6, [A57Write_12cyc_1S_1I]>,
962    SchedVar<A57LMAddrPred7, [A57Write_14cyc_1S_1I]>,
963    SchedVar<A57LMAddrPred8, [A57Write_16cyc_1S_1I]>,
964    SchedVar<NoSchedPred,    [A57Write_2cyc_1S_1I]>
965]>;
966
967def : InstRW<[A57WriteVSTMs], (instregex "VSTMSIA$")>;
968def : InstRW<[A57WriteVSTMd], (instregex "VSTMDIA$")>;
969def : InstRW<[A57WrBackOne, A57WriteVSTMs_Upd],
970  (instregex "VSTM(SIA_UPD|SDB_UPD)")>;
971def : InstRW<[A57WrBackOne, A57WriteVSTMd_Upd],
972  (instregex "VSTM(DIA_UPD|DDB_UPD)")>;
973
974// --- 3.14 ASIMD Integer Instructions ---
975
976// ASIMD absolute diff, 3cyc F0/F1 for integer VABD
977def : InstRW<[A57Write_3cyc_1V], (instregex "VABD(s|u)")>;
978
979// ASIMD absolute diff accum: 4(1) F1 for D-form, 5(2) F1 for Q-form
980def A57WriteVABAD : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
981def A57ReadVABAD  : SchedReadAdvance<3, [A57WriteVABAD]>;
982def : InstRW<[A57WriteVABAD, A57ReadVABAD],
983  (instregex "VABA(s|u)(v8i8|v4i16|v2i32)")>;
984def A57WriteVABAQ : SchedWriteRes<[A57UnitX]> { let Latency = 5; }
985def A57ReadVABAQ  : SchedReadAdvance<3, [A57WriteVABAQ]>;
986def : InstRW<[A57WriteVABAQ, A57ReadVABAQ],
987  (instregex "VABA(s|u)(v16i8|v8i16|v4i32)")>;
988
989// ASIMD absolute diff accum long: 4(1) F1 for VABAL
990def A57WriteVABAL : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
991def A57ReadVABAL  : SchedReadAdvance<3, [A57WriteVABAL]>;
992def : InstRW<[A57WriteVABAL, A57ReadVABAL], (instregex "VABAL(s|u)")>;
993
994// ASIMD absolute diff long: 3cyc F0/F1 for VABDL
995def : InstRW<[A57Write_3cyc_1V], (instregex "VABDL(s|u)")>;
996
997// ASIMD arith, basic
998def : InstRW<[A57Write_3cyc_1V], (instregex "VADDv", "VADDL", "VADDW",
999  "VNEG(s8d|s16d|s32d|s8q|s16q|s32q|d|q)",
1000  "VPADDi", "VPADDL", "VSUBv", "VSUBL", "VSUBW")>;
1001
1002// ASIMD arith, complex
1003def : InstRW<[A57Write_3cyc_1V], (instregex "VABS", "VADDHN", "VHADD", "VHSUB",
1004  "VQABS", "VQADD", "VQNEG", "VQSUB",
1005  "VRADDHN", "VRHADD", "VRSUBHN", "VSUBHN")>;
1006
1007// ASIMD compare
1008def : InstRW<[A57Write_3cyc_1V],
1009  (instregex "VCEQ", "VCGE", "VCGT", "VCLE", "VTST", "VCLT")>;
1010
1011// ASIMD logical
1012def : InstRW<[A57Write_3cyc_1V],
1013  (instregex "VAND", "VBIC", "VMVN", "VORR", "VORN", "VEOR")>;
1014
1015// ASIMD max/min
1016def : InstRW<[A57Write_3cyc_1V],
1017  (instregex "(VMAX|VMIN)(s|u)", "(VPMAX|VPMIN)(s8|s16|s32|u8|u16|u32)")>;
1018
1019// ASIMD multiply, D-form: 5cyc F0 for r0px, 4cyc F0 for r1p0 and later
1020// Cortex-A57 r1p0 and later reduce the latency of ASIMD multiply
1021// and multiply-with-accumulate instructions relative to r0pX.
1022def A57WriteVMULD_VecInt : SchedWriteVariant<[
1023  SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
1024  SchedVar<NoSchedPred,        [A57Write_5cyc_1W]>]>;
1025def : InstRW<[A57WriteVMULD_VecInt], (instregex
1026  "VMUL(v8i8|v4i16|v2i32|pd)", "VMULsl(v4i16|v2i32)",
1027  "VQDMULH(sl)?(v4i16|v2i32)", "VQRDMULH(sl)?(v4i16|v2i32)")>;
1028
1029// ASIMD multiply, Q-form: 6cyc F0 for r0px, 5cyc F0 for r1p0 and later
1030def A57WriteVMULQ_VecInt : SchedWriteVariant<[
1031  SchedVar<IsR1P0AndLaterPred, [A57Write_5cyc_1W]>,
1032  SchedVar<NoSchedPred,        [A57Write_6cyc_1W]>]>;
1033def : InstRW<[A57WriteVMULQ_VecInt], (instregex
1034  "VMUL(v16i8|v8i16|v4i32|pq)", "VMULsl(v8i16|v4i32)",
1035  "VQDMULH(sl)?(v8i16|v4i32)", "VQRDMULH(sl)?(v8i16|v4i32)")>;
1036
1037// ASIMD multiply accumulate, D-form
1038// 5cyc F0 for r0px, 4cyc F0 for r1p0 and later, 1cyc for accumulate sequence
1039// (4 or 3 ReadAdvance)
1040def A57WriteVMLAD_VecInt : SchedWriteVariant<[
1041  SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
1042  SchedVar<NoSchedPred,        [A57Write_5cyc_1W]>]>;
1043def A57ReadVMLAD_VecInt : SchedReadVariant<[
1044  SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<3, [A57WriteVMLAD_VecInt]>]>,
1045  SchedVar<NoSchedPred,        [SchedReadAdvance<4, [A57WriteVMLAD_VecInt]>]>
1046]>;
1047def : InstRW<[A57WriteVMLAD_VecInt, A57ReadVMLAD_VecInt],
1048  (instregex "VMLA(sl)?(v8i8|v4i16|v2i32)", "VMLS(sl)?(v8i8|v4i16|v2i32)")>;
1049
1050// ASIMD multiply accumulate, Q-form
1051// 6cyc F0 for r0px, 5cyc F0 for r1p0 and later, 2cyc for accumulate sequence
1052// (4 or 3 ReadAdvance)
1053def A57WriteVMLAQ_VecInt : SchedWriteVariant<[
1054  SchedVar<IsR1P0AndLaterPred, [A57Write_5cyc_1W]>,
1055  SchedVar<NoSchedPred,        [A57Write_6cyc_1W]>]>;
1056def A57ReadVMLAQ_VecInt : SchedReadVariant<[
1057  SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<3, [A57WriteVMLAQ_VecInt]>]>,
1058  SchedVar<NoSchedPred,        [SchedReadAdvance<4, [A57WriteVMLAQ_VecInt]>]>
1059]>;
1060def : InstRW<[A57WriteVMLAQ_VecInt, A57ReadVMLAQ_VecInt],
1061  (instregex "VMLA(sl)?(v16i8|v8i16|v4i32)", "VMLS(sl)?(v16i8|v8i16|v4i32)")>;
1062
1063// ASIMD multiply accumulate long
1064// 5cyc F0 for r0px, 4cyc F0 for r1p0 and later, 1cyc for accumulate sequence
1065// (4 or 3 ReadAdvance)
1066def A57WriteVMLAL_VecInt : SchedWriteVariant<[
1067  SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
1068  SchedVar<NoSchedPred,        [A57Write_5cyc_1W]>]>;
1069def A57ReadVMLAL_VecInt : SchedReadVariant<[
1070  SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<3, [A57WriteVMLAL_VecInt]>]>,
1071  SchedVar<NoSchedPred,        [SchedReadAdvance<4, [A57WriteVMLAL_VecInt]>]>
1072]>;
1073def : InstRW<[A57WriteVMLAL_VecInt, A57ReadVMLAL_VecInt],
1074  (instregex "VMLAL(s|u)", "VMLSL(s|u)")>;
1075
1076// ASIMD multiply accumulate saturating long
1077// 5cyc F0 for r0px, 4cyc F0 for r1p0 and later, 2cyc for accumulate sequence
1078// (3 or 2 ReadAdvance)
1079def A57WriteVQDMLAL_VecInt : SchedWriteVariant<[
1080  SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
1081  SchedVar<NoSchedPred,        [A57Write_5cyc_1W]>]>;
1082def A57ReadVQDMLAL_VecInt : SchedReadVariant<[
1083  SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<2, [A57WriteVQDMLAL_VecInt]>]>,
1084  SchedVar<NoSchedPred,        [SchedReadAdvance<3, [A57WriteVQDMLAL_VecInt]>]>
1085]>;
1086def : InstRW<[A57WriteVQDMLAL_VecInt, A57ReadVQDMLAL_VecInt],
1087  (instregex "VQDMLAL", "VQDMLSL")>;
1088
1089// Vector Saturating Rounding Doubling Multiply Accumulate/Subtract Long
1090// Scheduling info from VQDMLAL/VQDMLSL
1091def : InstRW<[A57WriteVQDMLAL_VecInt, A57ReadVQDMLAL_VecInt],
1092  (instregex "VQRDMLAH", "VQRDMLSH")>;
1093
1094// ASIMD multiply long
1095// 5cyc F0 for r0px, 4cyc F0 for r1p0 and later
1096def A57WriteVMULL_VecInt : SchedWriteVariant<[
1097  SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
1098  SchedVar<NoSchedPred,        [A57Write_5cyc_1W]>]>;
1099def : InstRW<[A57WriteVMULL_VecInt],
1100  (instregex "VMULL(s|u|p8|sls|slu)", "VQDMULL")>;
1101
1102// ASIMD pairwise add and accumulate
1103// 4cyc F1, 1cyc for accumulate sequence (3cyc ReadAdvance)
1104def A57WriteVPADAL : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
1105def A57ReadVPADAL  : SchedReadAdvance<3, [A57WriteVPADAL]>;
1106def : InstRW<[A57WriteVPADAL, A57ReadVPADAL], (instregex "VPADAL(s|u)")>;
1107
1108// ASIMD shift accumulate
1109// 4cyc F1, 1cyc for accumulate sequence (3cyc ReadAdvance)
1110def A57WriteVSRA : SchedWriteRes<[A57UnitX]> { let Latency = 4;  }
1111def A57ReadVSRA  : SchedReadAdvance<3, [A57WriteVSRA]>;
1112def : InstRW<[A57WriteVSRA, A57ReadVSRA], (instregex "VSRA", "VRSRA")>;
1113
1114// ASIMD shift by immed, basic
1115def : InstRW<[A57Write_3cyc_1X],
1116  (instregex "VMOVL", "VSHLi", "VSHLL", "VSHR(s|u)", "VSHRN")>;
1117
1118// ASIMD shift by immed, complex
1119def : InstRW<[A57Write_4cyc_1X], (instregex
1120  "VQRSHRN", "VQRSHRUN", "VQSHL(si|ui|su)", "VQSHRN", "VQSHRUN", "VRSHR(s|u)",
1121  "VRSHRN")>;
1122
1123// ASIMD shift by immed and insert, basic, D-form
1124def : InstRW<[A57Write_4cyc_1X], (instregex
1125  "VSLI(v8i8|v4i16|v2i32|v1i64)", "VSRI(v8i8|v4i16|v2i32|v1i64)")>;
1126
1127// ASIMD shift by immed and insert, basic, Q-form
1128def : InstRW<[A57Write_5cyc_1X], (instregex
1129  "VSLI(v16i8|v8i16|v4i32|v2i64)", "VSRI(v16i8|v8i16|v4i32|v2i64)")>;
1130
1131// ASIMD shift by register, basic, D-form
1132def : InstRW<[A57Write_3cyc_1X], (instregex
1133  "VSHL(s|u)(v8i8|v4i16|v2i32|v1i64)")>;
1134
1135// ASIMD shift by register, basic, Q-form
1136def : InstRW<[A57Write_4cyc_1X], (instregex
1137  "VSHL(s|u)(v16i8|v8i16|v4i32|v2i64)")>;
1138
1139// ASIMD shift by register, complex, D-form
1140// VQRSHL, VQSHL, VRSHL
1141def : InstRW<[A57Write_4cyc_1X], (instregex
1142  "VQRSHL(s|u)(v8i8|v4i16|v2i32|v1i64)", "VQSHL(s|u)(v8i8|v4i16|v2i32|v1i64)",
1143  "VRSHL(s|u)(v8i8|v4i16|v2i32|v1i64)")>;
1144
1145// ASIMD shift by register, complex, Q-form
1146def : InstRW<[A57Write_5cyc_1X], (instregex
1147  "VQRSHL(s|u)(v16i8|v8i16|v4i32|v2i64)", "VQSHL(s|u)(v16i8|v8i16|v4i32|v2i64)",
1148  "VRSHL(s|u)(v16i8|v8i16|v4i32|v2i64)")>;
1149
1150// --- 3.15 ASIMD Floating-Point Instructions ---
1151// ASIMD FP absolute value
1152def : InstRW<[A57Write_3cyc_1V], (instregex "VABS(fd|fq|hd|hq)")>;
1153
1154// ASIMD FP arith
1155def : InstRW<[A57Write_5cyc_1V], (instregex "VABD(fd|fq|hd|hq)",
1156  "VADD(fd|fq|hd|hq)", "VPADD(f|h)", "VSUB(fd|fq|hd|hq)")>;
1157
1158def : InstRW<[A57Write_5cyc_1V], (instregex "VCADD", "VCMLA")>;
1159
1160// ASIMD FP compare
1161def : InstRW<[A57Write_5cyc_1V], (instregex "VAC(GE|GT|LE|LT)",
1162  "VC(EQ|GE|GT|LE)(fd|fq|hd|hq)")>;
1163
1164// ASIMD FP convert, integer
1165def : InstRW<[A57Write_5cyc_1V], (instregex
1166  "VCVT(f2sd|f2ud|s2fd|u2fd|f2sq|f2uq|s2fq|u2fq|f2xsd|f2xud|xs2fd|xu2fd)",
1167  "VCVT(f2xsq|f2xuq|xs2fq|xu2fq)",
1168  "VCVT(AN|MN|NN|PN)(SDf|SQf|UDf|UQf|SDh|SQh|UDh|UQh)")>;
1169
1170// ASIMD FP convert, half-precision: 8cyc F0/F1
1171def : InstRW<[A57Write_8cyc_1V], (instregex
1172  "VCVT(h2sd|h2ud|s2hd|u2hd|h2sq|h2uq|s2hq|u2hq|h2xsd|h2xud|xs2hd|xu2hd)",
1173  "VCVT(h2xsq|h2xuq|xs2hq|xu2hq)",
1174  "VCVT(f2h|h2f)")>;
1175
1176// ASIMD FP max/min
1177def : InstRW<[A57Write_5cyc_1V], (instregex
1178  "(VMAX|VMIN)(fd|fq|hd|hq)", "(VPMAX|VPMIN)(f|h)", "VMAXNM", "VMINNM")>;
1179
1180// ASIMD FP multiply
1181def A57WriteVMUL_VecFP  : SchedWriteRes<[A57UnitV]> { let Latency = 5;  }
1182def : InstRW<[A57WriteVMUL_VecFP], (instregex "VMUL(sl)?(fd|fq|hd|hq)")>;
1183
1184// ASIMD FP multiply accumulate: 9cyc F0/F1, 4cyc for accumulate sequence
1185def A57WriteVMLA_VecFP  : SchedWriteRes<[A57UnitV]> { let Latency = 9;  }
1186def A57ReadVMLA_VecFP  :
1187  SchedReadAdvance<5, [A57WriteVMLA_VecFP, A57WriteVMUL_VecFP]>;
1188def : InstRW<[A57WriteVMLA_VecFP, A57ReadVMLA_VecFP],
1189  (instregex "(VMLA|VMLS)(sl)?(fd|fq|hd|hq)", "(VFMA|VFMS)(fd|fq|hd|hq)")>;
1190
1191// ASIMD FP negate
1192def : InstRW<[A57Write_3cyc_1V], (instregex "VNEG(fd|f32q|hd|hq)")>;
1193
1194// ASIMD FP round to integral
1195def : InstRW<[A57Write_5cyc_1V], (instregex
1196  "VRINT(AN|MN|NN|PN|XN|ZN)(Df|Qf|Dh|Qh)")>;
1197
1198// --- 3.16 ASIMD Miscellaneous Instructions ---
1199
1200// ASIMD bitwise insert
1201def : InstRW<[A57Write_3cyc_1V], (instregex "VBIF", "VBIT", "VBSL")>;
1202
1203// ASIMD count
1204def : InstRW<[A57Write_3cyc_1V], (instregex "VCLS", "VCLZ", "VCNT")>;
1205
1206// ASIMD duplicate, core reg: 8cyc "L, F0/F1"
1207def : InstRW<[A57Write_8cyc_1L_1V], (instregex "VDUP(8|16|32)(d|q)")>;
1208
1209// ASIMD duplicate, scalar: 3cyc "F0/F1"
1210def : InstRW<[A57Write_3cyc_1V], (instregex "VDUPLN(8|16|32)(d|q)")>;
1211
1212// ASIMD extract
1213def : InstRW<[A57Write_3cyc_1V], (instregex "VEXT(d|q)(8|16|32|64)")>;
1214
1215// ASIMD move, immed
1216def : InstRW<[A57Write_3cyc_1V], (instregex
1217  "VMOV(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v1i64|v2i64|v2f32|v4f32)",
1218  "VMOVD0", "VMOVQ0")>;
1219
1220// ASIMD move, narrowing
1221def : InstRW<[A57Write_3cyc_1V], (instregex "VMOVN")>;
1222
1223// ASIMD move, saturating
1224def : InstRW<[A57Write_4cyc_1X], (instregex "VQMOVN")>;
1225
1226// ASIMD reciprocal estimate
1227def : InstRW<[A57Write_5cyc_1V], (instregex "VRECPE", "VRSQRTE")>;
1228
1229// ASIMD reciprocal step, FZ
1230def : InstRW<[A57Write_9cyc_1V], (instregex "VRECPS", "VRSQRTS")>;
1231
1232// ASIMD reverse, swap, table lookup (1-2 reg)
1233def : InstRW<[A57Write_3cyc_1V], (instregex "VREV", "VSWP", "VTB(L|X)(1|2)")>;
1234
1235// ASIMD table lookup (3-4 reg)
1236def : InstRW<[A57Write_6cyc_1V], (instregex "VTBL(3|4)", "VTBX(3|4)")>;
1237
1238// ASIMD transfer, scalar to core reg: 6cyc "L, I0/I1"
1239def : InstRW<[A57Write_6cyc_1L_1I], (instregex "VGETLN")>;
1240
1241// ASIMD transfer, core reg to scalar: 8cyc "L, F0/F1"
1242def : InstRW<[A57Write_8cyc_1L_1V], (instregex "VSETLN")>;
1243
1244// ASIMD transpose
1245def : InstRW<[A57Write_3cyc_1V, A57Write_3cyc_1V], (instregex "VTRN")>;
1246
1247// ASIMD unzip/zip, D-form
1248def : InstRW<[A57Write_3cyc_1V, A57Write_3cyc_1V],
1249  (instregex "VUZPd", "VZIPd")>;
1250
1251// ASIMD unzip/zip, Q-form
1252def : InstRW<[A57Write_6cyc_1V, A57Write_6cyc_1V],
1253  (instregex "VUZPq", "VZIPq")>;
1254
1255// --- 3.17 ASIMD Load Instructions ---
1256
1257// Overriden via InstRW for this processor.
1258def : WriteRes<WriteVLD1, []>;
1259def : WriteRes<WriteVLD2, []>;
1260def : WriteRes<WriteVLD3, []>;
1261def : WriteRes<WriteVLD4, []>;
1262def : WriteRes<WriteVST1, []>;
1263def : WriteRes<WriteVST2, []>;
1264def : WriteRes<WriteVST3, []>;
1265def : WriteRes<WriteVST4, []>;
1266
1267// 1-2 reg: 5cyc L, +I for writeback, 1 cyc wb latency
1268def : InstRW<[A57Write_5cyc_1L], (instregex "VLD1(d|q)(8|16|32|64)$")>;
1269def : InstRW<[A57Write_5cyc_1L_1I, A57WrBackOne],
1270  (instregex "VLD1(d|q)(8|16|32|64)wb")>;
1271
1272// 3-4 reg: 6cyc L, +I for writeback, 1 cyc wb latency
1273def : InstRW<[A57Write_6cyc_1L],
1274  (instregex "VLD1(d|q)(8|16|32|64)(T|Q)$", "VLD1d64(T|Q)Pseudo")>;
1275
1276def : InstRW<[A57Write_6cyc_1L_1I, A57WrBackOne],
1277  (instregex "VLD1(d|q)(8|16|32|64)(T|Q)wb")>;
1278
1279// ASIMD load, 1 element, one lane and all lanes: 8cyc "L, F0/F1"
1280def : InstRW<[A57Write_8cyc_1L_1V], (instregex
1281  "VLD1(LN|DUP)(d|q)(8|16|32)$", "VLD1(LN|DUP)(d|q)(8|16|32)Pseudo$")>;
1282def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne], (instregex
1283  "VLD1(LN|DUP)(d|q)(8|16|32)(wb|_UPD)", "VLD1LNq(8|16|32)Pseudo_UPD")>;
1284
1285// ASIMD load, 2 element, multiple, 2 reg: 8cyc "L, F0/F1"
1286def : InstRW<[A57Write_8cyc_1L_1V],
1287      (instregex "VLD2(d|q)(8|16|32)$", "VLD2q(8|16|32)Pseudo$")>;
1288def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
1289      (instregex "VLD2(d|q)(8|16|32)wb", "VLD2q(8|16|32)PseudoWB")>;
1290
1291// ASIMD load, 2 element, multiple, 4 reg: 9cyc "L, F0/F1"
1292def : InstRW<[A57Write_9cyc_1L_1V], (instregex "VLD2b(8|16|32)$")>;
1293def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
1294      (instregex "VLD2b(8|16|32)wb")>;
1295
1296// ASIMD load, 2 element, one lane and all lanes: 8cyc "L, F0/F1"
1297def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V],
1298      (instregex "VLD2(DUP|LN)(d|q)(8|16|32|8x2|16x2|32x2)$",
1299                 "VLD2LN(d|q)(8|16|32)Pseudo$")>;
1300// 2 results + wb result
1301def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V, A57WrBackOne],
1302      (instregex "VLD2LN(d|q)(8|16|32)_UPD$")>;
1303// 1 result + wb result
1304def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
1305      (instregex "VLD2DUPd(8|16|32|8x2|16x2|32x2)wb",
1306                 "VLD2LN(d|q)(8|16|32)Pseudo_UPD")>;
1307
1308// ASIMD load, 3 element, multiple, 3 reg: 9cyc "L, F0/F1"
1309// 3 results
1310def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V],
1311      (instregex "VLD3(d|q)(8|16|32)$")>;
1312// 1 result
1313def : InstRW<[A57Write_9cyc_1L_1V],
1314      (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo$")>;
1315// 3 results + wb
1316def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
1317              A57Write_9cyc_1L_1V_1I, A57WrBackOne],
1318      (instregex "VLD3(d|q)(8|16|32)_UPD$")>;
1319// 1 result + wb
1320def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
1321      (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
1322
1323// ASIMD load, 3 element, one lane, size 32: 8cyc "L, F0/F1"
1324def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V],
1325      (instregex "VLD3LN(d|q)32$",
1326                 "VLD3LN(d|q)32Pseudo$")>;
1327def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
1328              A57Write_8cyc_1L_1V_1I, A57WrBackOne],
1329      (instregex "VLD3LN(d|q)32_UPD")>;
1330def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
1331      (instregex "VLD3LN(d|q)32Pseudo_UPD")>;
1332
1333// ASIMD load, 3 element, one lane, size 8/16: 9cyc "L, F0/F1"
1334def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V],
1335      (instregex "VLD3LN(d|q)(8|16)$",
1336                 "VLD3LN(d|q)(8|16)Pseudo$")>;
1337def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
1338              A57Write_9cyc_1L_1V_1I, A57WrBackOne],
1339      (instregex "VLD3LN(d|q)(8|16)_UPD")>;
1340def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
1341      (instregex "VLD3LN(d|q)(8|16)Pseudo_UPD")>;
1342
1343// ASIMD load, 3 element, all lanes: 8cyc "L, F0/F1"
1344def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V],
1345      (instregex "VLD3DUP(d|q)(8|16|32)$",
1346                 "VLD3DUP(d|q)(8|16|32)Pseudo$")>;
1347def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
1348              A57Write_8cyc_1L_1V_1I, A57WrBackOne],
1349      (instregex "VLD3DUP(d|q)(8|16|32)_UPD")>;
1350def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
1351      (instregex "VLD3DUP(d|q)(8|16|32)Pseudo_UPD")>;
1352
1353// ASIMD load, 4 element, multiple, 4 reg: 9cyc "L, F0/F1"
1354def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V,
1355              A57Write_9cyc_1L_1V],
1356      (instregex "VLD4(d|q)(8|16|32)$")>;
1357def : InstRW<[A57Write_9cyc_1L_1V],
1358      (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo$")>;
1359def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
1360              A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I, A57WrBackOne],
1361      (instregex "VLD4(d|q)(8|16|32)_UPD")>;
1362def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
1363      (instregex  "VLD4(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
1364
1365// ASIMD load, 4 element, one lane, size 32: 8cyc "L, F0/F1"
1366def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V,
1367              A57Write_8cyc_1L_1V],
1368      (instregex "VLD4LN(d|q)32$",
1369                 "VLD4LN(d|q)32Pseudo$")>;
1370def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
1371              A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
1372              A57WrBackOne],
1373      (instregex "VLD4LN(d|q)32_UPD")>;
1374def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
1375      (instregex "VLD4LN(d|q)32Pseudo_UPD")>;
1376
1377// ASIMD load, 4 element, one lane, size 8/16: 9cyc "L, F0/F1"
1378def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V,
1379              A57Write_9cyc_1L_1V],
1380      (instregex "VLD4LN(d|q)(8|16)$",
1381                 "VLD4LN(d|q)(8|16)Pseudo$")>;
1382def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
1383              A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
1384              A57WrBackOne],
1385      (instregex "VLD4LN(d|q)(8|16)_UPD")>;
1386def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
1387      (instregex "VLD4LN(d|q)(8|16)Pseudo_UPD")>;
1388
1389// ASIMD load, 4 element, all lanes: 8cyc "L, F0/F1"
1390def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V,
1391              A57Write_8cyc_1L_1V],
1392      (instregex "VLD4DUP(d|q)(8|16|32)$",
1393                 "VLD4DUP(d|q)(8|16|32)Pseudo$")>;
1394def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
1395              A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
1396              A57WrBackOne],
1397      (instregex "VLD4DUP(d|q)(8|16|32)_UPD")>;
1398def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
1399      (instregex "VLD4DUP(d|q)(8|16|32)Pseudo_UPD")>;
1400
1401// --- 3.18 ASIMD Store Instructions ---
1402
1403// ASIMD store, 1 element, multiple, 1 reg: 1cyc S
1404def : InstRW<[A57Write_1cyc_1S], (instregex "VST1d(8|16|32|64)$")>;
1405def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I],
1406      (instregex "VST1d(8|16|32|64)wb")>;
1407// ASIMD store, 1 element, multiple, 2 reg: 2cyc S
1408def : InstRW<[A57Write_2cyc_1S], (instregex "VST1q(8|16|32|64)$")>;
1409def : InstRW<[A57WrBackOne, A57Write_2cyc_1S_1I],
1410      (instregex "VST1q(8|16|32|64)wb")>;
1411// ASIMD store, 1 element, multiple, 3 reg: 3cyc S
1412def : InstRW<[A57Write_3cyc_1S],
1413      (instregex "VST1d(8|16|32|64)T$", "VST1d64TPseudo$")>;
1414def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1I],
1415      (instregex "VST1d(8|16|32|64)Twb", "VST1d64TPseudoWB")>;
1416// ASIMD store, 1 element, multiple, 4 reg: 4cyc S
1417def : InstRW<[A57Write_4cyc_1S],
1418      (instregex "VST1d(8|16|32|64)(Q|QPseudo)$")>;
1419def : InstRW<[A57WrBackOne, A57Write_4cyc_1S_1I],
1420      (instregex "VST1d(8|16|32|64)(Qwb|QPseudoWB)")>;
1421// ASIMD store, 1 element, one lane: 3cyc "F0/F1, S"
1422def : InstRW<[A57Write_3cyc_1S_1V],
1423      (instregex "VST1LNd(8|16|32)$", "VST1LNq(8|16|32)Pseudo$")>;
1424def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
1425      (instregex "VST1LNd(8|16|32)_UPD", "VST1LNq(8|16|32)Pseudo_UPD")>;
1426// ASIMD store, 2 element, multiple, 2 reg: 3cyc "F0/F1, S"
1427def : InstRW<[A57Write_3cyc_1S_1V],
1428      (instregex "VST2(d|b)(8|16|32)$")>;
1429def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
1430      (instregex "VST2(b|d)(8|16|32)wb")>;
1431// ASIMD store, 2 element, multiple, 4 reg: 4cyc "F0/F1, S"
1432def : InstRW<[A57Write_4cyc_1S_1V],
1433      (instregex "VST2q(8|16|32)$", "VST2q(8|16|32)Pseudo$")>;
1434def : InstRW<[A57WrBackOne, A57Write_4cyc_1S_1V_1I],
1435      (instregex "VST2q(8|16|32)wb", "VST2q(8|16|32)PseudoWB")>;
1436// ASIMD store, 2 element, one lane: 3cyc "F0/F1, S"
1437def : InstRW<[A57Write_3cyc_1S_1V],
1438      (instregex "VST2LN(d|q)(8|16|32)$", "VST2LN(d|q)(8|16|32)Pseudo$")>;
1439def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
1440      (instregex "VST2LN(d|q)(8|16|32)_UPD",
1441                 "VST2LN(d|q)(8|16|32)Pseudo_UPD")>;
1442// ASIMD store, 3 element, multiple, 3 reg
1443def : InstRW<[A57Write_3cyc_1S_1V],
1444      (instregex "VST3(d|q)(8|16|32)$", "VST3(d|q)(8|16|32)(oddP|P)seudo$")>;
1445def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
1446      (instregex "VST3(d|q)(8|16|32)_UPD",
1447                 "VST3(d|q)(8|16|32)(oddP|P)seudo_UPD$")>;
1448// ASIMD store, 3 element, one lane
1449def : InstRW<[A57Write_3cyc_1S_1V],
1450      (instregex "VST3LN(d|q)(8|16|32)$", "VST3LN(d|q)(8|16|32)Pseudo$")>;
1451def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
1452      (instregex "VST3LN(d|q)(8|16|32)_UPD",
1453                 "VST3LN(d|q)(8|16|32)Pseudo_UPD")>;
1454// ASIMD store, 4 element, multiple, 4 reg
1455def : InstRW<[A57Write_4cyc_1S_1V],
1456      (instregex "VST4(d|q)(8|16|32)$", "VST4(d|q)(8|16|32)(oddP|P)seudo$")>;
1457def : InstRW<[A57WrBackOne, A57Write_4cyc_1S_1V_1I],
1458      (instregex "VST4(d|q)(8|16|32)_UPD",
1459                 "VST4(d|q)(8|16|32)(oddP|P)seudo_UPD$")>;
1460// ASIMD store, 4 element, one lane
1461def : InstRW<[A57Write_3cyc_1S_1V],
1462      (instregex "VST4LN(d|q)(8|16|32)$", "VST4LN(d|q)(8|16|32)Pseudo$")>;
1463def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
1464      (instregex "VST4LN(d|q)(8|16|32)_UPD",
1465                 "VST4LN(d|q)(8|16|32)Pseudo_UPD")>;
1466
1467// --- 3.19 Cryptography Extensions ---
1468// Crypto AES ops
1469// AESD, AESE, AESIMC, AESMC: 3cyc F0
1470def : InstRW<[A57Write_3cyc_1W], (instregex "^AES")>;
1471// Crypto polynomial (64x64) multiply long (VMULL.P64): 3cyc F0
1472def : InstRW<[A57Write_3cyc_1W], (instregex "^VMULLp64")>;
1473// Crypto SHA1 xor ops: 6cyc F0/F1
1474def : InstRW<[A57Write_6cyc_2V], (instregex "^SHA1SU0")>;
1475// Crypto SHA1 fast ops: 3cyc F0
1476def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA1(H|SU1)")>;
1477// Crypto SHA1 slow ops: 6cyc F0
1478def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA1[CMP]")>;
1479// Crypto SHA256 fast ops: 3cyc F0
1480def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA256SU0")>;
1481// Crypto SHA256 slow ops: 6cyc F0
1482def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA256(H|H2|SU1)")>;
1483
1484// --- 3.20 CRC ---
1485def : InstRW<[A57Write_3cyc_1W], (instregex "^(t2)?CRC32")>;
1486
1487// -----------------------------------------------------------------------------
1488// Common definitions
1489def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; }
1490def : SchedAlias<WriteALU, A57Write_1cyc_1I>;
1491
1492def : SchedAlias<WriteBr, A57Write_1cyc_1B>;
1493def : SchedAlias<WriteBrL, A57Write_1cyc_1B_1I>;
1494def : SchedAlias<WriteBrTbl, A57Write_1cyc_1B_1I>;
1495def : SchedAlias<WritePreLd, A57Write_4cyc_1L>;
1496
1497def : SchedAlias<WriteLd, A57Write_4cyc_1L>;
1498def : SchedAlias<WriteST, A57Write_1cyc_1S>;
1499def : ReadAdvance<ReadALU, 0>;
1500
1501} // SchedModel = CortexA57Model
1502
1503