1//=- ARMScheduleA57.td - ARM Cortex-A57 Scheduling Defs -----*- tablegen -*-=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the machine model for ARM Cortex-A57 to support
10// instruction scheduling and other instruction cost heuristics.
11//
12//===----------------------------------------------------------------------===//
13
14//===----------------------------------------------------------------------===//
15// *** Common description and scheduling model parameters taken from AArch64 ***
16// The Cortex-A57 is a traditional superscalar microprocessor with a
17// conservative 3-wide in-order stage for decode and dispatch. Combined with the
18// much wider out-of-order issue stage, this produced a need to carefully
19// schedule micro-ops so that all three decoded each cycle are successfully
20// issued as the reservation station(s) simply don't stay occupied for long.
21// Therefore, IssueWidth is set to the narrower of the two at three, while still
22// modeling the machine as out-of-order.
23
// Predicates on whether the instruction defines CPSR, alone or combined with
// being predicated. Both are evaluated through TII on the current MI.
24def IsCPSRDefinedPred : SchedPredicate<[{TII->isCPSRDefined(*MI)}]>;
25def IsCPSRDefinedAndPredicatedPred :
26  SchedPredicate<[{TII->isCPSRDefined(*MI) && TII->isPredicated(*MI)}]>;
27
28// Cortex A57 rev. r1p0 or later (false = r0px)
// The predicate body is the constant `false`, so variants keyed on it always
// take their r0px entry.
29def IsR1P0AndLaterPred : SchedPredicate<[{false}]>;
30
31// If Addrmode3 contains register offset (not immediate)
// Implemented as the negation of TII->isAddrMode3OpImm, called with index 1.
32def IsLdrAm3RegOffPred :
33  SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 1)}]>;
34// The same predicate with operand offset 2 and 3:
35def IsLdrAm3RegOffPredX2 :
36  SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 2)}]>;
37def IsLdrAm3RegOffPredX3 :
38  SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 3)}]>;
39
40// If Addrmode3 contains "minus register"
// Implemented with TII->isAddrMode3OpMinusReg, called with index 1.
41def IsLdrAm3NegRegOffPred :
42  SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 1)}]>;
43// The same predicate with operand offset 2 and 3:
44def IsLdrAm3NegRegOffPredX2 :
45  SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 2)}]>;
46def IsLdrAm3NegRegOffPredX3 :
47  SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 3)}]>;
48
49// Load, scaled register offset, not plus LSL2
// The X0 / unsuffixed / X2 forms pass index 0, 1 and 2 to the TII helper.
50def IsLdstsoScaledNotOptimalPredX0 :
51  SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 0)}]>;
52def IsLdstsoScaledNotOptimalPred :
53  SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 1)}]>;
54def IsLdstsoScaledNotOptimalPredX2 :
55  SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 2)}]>;
56
57// Load, scaled register offset
// (any scaled register form, via TII->isLdstScaledReg; index 1 and 2)
58def IsLdstsoScaledPred :
59  SchedPredicate<[{TII->isLdstScaledReg(*MI, 1)}]>;
60def IsLdstsoScaledPredX2 :
61  SchedPredicate<[{TII->isLdstScaledReg(*MI, 2)}]>;
62
// Load/store register offset using a minus register, via
// TII->isLdstSoMinusReg; index 0, 1 and 2.
63def IsLdstsoMinusRegPredX0 :
64  SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 0)}]>;
65def IsLdstsoMinusRegPred :
66  SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 1)}]>;
67def IsLdstsoMinusRegPredX2 :
68  SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 2)}]>;
69
70// Load, scaled register offset (checked with TII->isAm2ScaledReg, index 1)
71def IsLdrAm2ScaledPred :
72  SchedPredicate<[{TII->isAm2ScaledReg(*MI, 1)}]>;
73
74// LDM, base reg in list
75def IsLdmBaseRegInList :
76  SchedPredicate<[{TII->isLDMBaseRegInList(*MI)}]>;
77
// Holder for an ordered list of SchedWriteRes records. The load-multiple
// variants below take slices of the Writes field (e.g. Writes[0-3]).
// SchedModel is left unset (?) in the class itself.
78class A57WriteLMOpsListType<list<SchedWriteRes> writes> {
79  list <SchedWriteRes> Writes = writes;
80  SchedMachineModel SchedModel = ?;
81}
82
83// *** Common description and scheduling model parameters taken from AArch64 ***
84// (AArch64SchedA57.td)
// Machine model record for the Cortex-A57. The `let SchedModel =
// CortexA57Model in { ... }` scopes in this file attach their records to it.
85def CortexA57Model : SchedMachineModel {
86  let IssueWidth        =   3; // 3-way decode and dispatch
87  let MicroOpBufferSize = 128; // 128 micro-op re-order buffer
88  let LoadLatency       =   4; // Optimistic load latency
89  let MispredictPenalty =  16; // Fetch + Decode/Rename/Dispatch + Branch
90
91  // Enable partial & runtime unrolling.
92  let LoopMicroOpBufferSize = 16;
  // The catch-all InstRW entries for pseudos and special instructions further
  // down exist to satisfy this setting.
93  let CompleteModel = 1;
94
95  // FIXME: Remove when all errors have been fixed.
  // Full overlap checking between InstRW definitions is disabled for now.
96  let FullInstRWOverlapCheck = 0;
97
  // Subtarget features declared as unsupported by this model.
98  let UnsupportedFeatures = [HasV8_1MMainline, HasMVEInt, HasMVEFloat,
99                             HasFPRegsV8_1M];
100}
101
102//===----------------------------------------------------------------------===//
103// Define each kind of processor resource and number available on Cortex-A57.
104// The Cortex-A57 has 8 pipelines, each with its own 8-entry queue where
105// micro-ops wait for their operands and then issue out-of-order.
106
// One ProcResource per micro-op type; the template argument is the number of
// units of that kind (two for type I, one for every other type).
107def A57UnitB : ProcResource<1>;  // Type B micro-ops
108def A57UnitI : ProcResource<2>;  // Type I micro-ops
109def A57UnitM : ProcResource<1>;  // Type M micro-ops
110def A57UnitL : ProcResource<1>;  // Type L micro-ops
111def A57UnitS : ProcResource<1>;  // Type S micro-ops
112
113def A57UnitX : ProcResource<1>;  // Type X micro-ops (F1)
114def A57UnitW : ProcResource<1>;  // Type W micro-ops (F0)
115
// Type V is a group made of the X and W units rather than a separate resource.
116let SchedModel = CortexA57Model in {
117  def A57UnitV : ProcResGroup<[A57UnitX, A57UnitW]>;    // Type V micro-ops
118}
119
120let SchedModel = CortexA57Model in {
121
122//===----------------------------------------------------------------------===//
123// Define customized scheduler read/write types specific to the Cortex-A57.
124
125include "ARMScheduleA57WriteRes.td"
126
127// To have "CompleteModel = 1", support of pseudos and special instructions
// All of the entries in this group map their opcodes to WriteNoop.
128def : InstRW<[WriteNoop], (instregex "(t)?BKPT$", "(t2)?CDP(2)?$",
129  "(t2)?CLREX$", "CONSTPOOL_ENTRY$", "COPY_STRUCT_BYVAL_I32$",
130  "(t2)?CPS[123]p$", "(t2)?DBG$", "(t2)?DMB$", "(t2)?DSB$", "ERET$",
131  "(t2|t)?HINT$", "(t)?HLT$", "(t2)?HVC$", "(t2)?ISB$", "ITasm$",
132  "(t2)?RFE(DA|DB|IA|IB)", "(t)?SETEND", "(t2)?SETPAN", "(t2)?SMC", "SPACE",
133  "(t2)?SRS(DA|DB|IA|IB)", "SWP(B)?", "t?TRAP", "(t2|t)?UDF$", "t2DCPS", "t2SG",
134  "t2TT", "tCPS", "CMP_SWAP", "t?SVC", "t2IT", "CompilerBarrier",
135  "t__brkdiv0")>;
136
// VFP status register moves.
137def : InstRW<[WriteNoop], (instregex "VMRS", "VMSR", "FMSTAT")>;
138
139// Specific memory instrs
// (two WriteNoop entries are listed for these opcodes)
140def : InstRW<[WriteNoop, WriteNoop], (instregex "(t2)?LDA", "(t2)?LDC", "(t2)?STC",
141  "(t2)?STL", "(t2)?LDREX", "(t2)?STREX", "MEMCPY")>;
142
143// coprocessor moves
144def : InstRW<[WriteNoop, WriteNoop], (instregex
145  "(t2)?MCR(2|R|R2)?$", "(t2)?MRC(2)?$",
146  "(t2)?MRRC(2)?$", "(t2)?MRS(banked|sys|_AR|_M|sys_AR)?$",
147  "(t2)?MSR(banked|i|_AR|_M)?$")>;
148
149// Deprecated instructions
150def : InstRW<[WriteNoop], (instregex "FLDM", "FSTM")>;
151
152// Pseudos
// Call-stack adjustment, exception-handling, jump-table and NEON "Asm"
// pseudo opcodes, all mapped to WriteNoop.
153def : InstRW<[WriteNoop], (instregex "(t2)?ABS$",
154  "(t)?ADJCALLSTACKDOWN$", "(t)?ADJCALLSTACKUP$", "(t2|t)?Int_eh_sjlj",
155  "tLDRpci_pic", "(t2)?SUBS_PC_LR",
156  "JUMPTABLE", "tInt_WIN_eh_sjlj_longjmp",
157  "VLD(1|2)LN(d|q)(WB_fixed_|WB_register_)?Asm",
158  "VLD(3|4)(DUP|LN)?(d|q)(WB_fixed_|WB_register_)?Asm",
159  "VST(1|2)LN(d|q)(WB_fixed_|WB_register_)?Asm",
160  "VST(3|4)(DUP|LN)?(d|q)(WB_fixed_|WB_register_)?Asm",
161  "WIN__CHKSTK", "WIN__DBZCHK")>;
162
163// Miscellaneous
164// -----------------------------------------------------------------------------
165
// COPY is named explicitly (instrs, not instregex) and uses the same write as
// a basic MOV.
166def : InstRW<[A57Write_1cyc_1I], (instrs COPY)>;
167
168// --- 3.2 Branch Instructions ---
169// B, BX, BL, BLX (imm, reg != LR, reg == LR), CBZ, CBNZ
170
// Plain branches, tail jumps and compare-and-branch.
171def : InstRW<[A57Write_1cyc_1B], (instregex "(t2|t)?B$", "t?BX", "(t2|t)?Bcc$",
172  "t?TAILJMP(d|r)", "TCRETURN(d|r)i", "tBfar", "tCBN?Z")>;
// Branch-and-link forms additionally name an I unit in their write.
173def : InstRW<[A57Write_1cyc_1B_1I],
174  (instregex "t?BL$", "BL_pred$", "t?BLXi", "t?TPsoft")>;
175def : InstRW<[A57Write_2cyc_1B_1I], (instregex "BLX", "tBLX(NS)?r")>;
176// Pseudos
177def : InstRW<[A57Write_2cyc_1B_1I], (instregex "BCCi64", "BCCZi64")>;
// Jump-table and indirect branches.
178def : InstRW<[A57Write_3cyc_1B_1I], (instregex "BR_JTadd", "t?BR_JTr",
179  "t2BR_JT", "t2BXJ", "(t2)?TB(B|H)(_JT)?$", "tBRIND")>;
// BR_JTm is the only entry here whose write names the L unit.
180def : InstRW<[A57Write_6cyc_1B_1L], (instregex "BR_JTm")>;
181
182// --- 3.3 Arithmetic and Logical Instructions ---
183// ADD{S}, ADC{S}, ADR, AND{S}, BIC{S}, CMN, CMP, EOR{S}, ORN{S}, ORR{S},
184// RSB{S}, RSC{S}, SUB{S}, SBC{S}, TEQ, TST
185
186def : InstRW<[A57Write_1cyc_1I], (instregex "tADDframe")>;
187
188// shift by register, conditional or unconditional
189// TODO: according to the doc, conditional uses I0/I1, unconditional uses M
190// Why would the more complex instruction use the simpler pipeline?
191// This may be an error in the doc.
// Both entries of A57WriteALUsi currently select the same write.
192def A57WriteALUsi : SchedWriteVariant<[
193  // lsl #2, lsl #1, or lsr #1.
194  SchedVar<IsPredicatedPred, [A57Write_2cyc_1M]>,
195  SchedVar<NoSchedPred,      [A57Write_2cyc_1M]>
196]>;
// Predicated form uses the I write, the NoSchedPred entry uses the M write.
197def A57WriteALUsr : SchedWriteVariant<[
198  SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
199  SchedVar<NoSchedPred,      [A57Write_2cyc_1M]>
200]>;
// Same split as A57WriteALUsr.
201def A57WriteALUSsr : SchedWriteVariant<[
202  SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
203  SchedVar<NoSchedPred,      [A57Write_2cyc_1M]>
204]>;
// Both entries resolve to ReadDefault.
205def A57ReadALUsr : SchedReadVariant<[
206  SchedVar<IsPredicatedPred, [ReadDefault]>,
207  SchedVar<NoSchedPred,      [ReadDefault]>
208]>;
// Bind the generic ARM ALU shift read/write types to the A57 variants above.
209def : SchedAlias<WriteALUsi,  A57WriteALUsi>;
210def : SchedAlias<WriteALUsr,  A57WriteALUsr>;
211def : SchedAlias<WriteALUSsr, A57WriteALUSsr>;
212def : SchedAlias<ReadALUsr,   A57ReadALUsr>;
213
// Compare with register-shifted operand: predicated form uses the I write,
// the NoSchedPred entry uses the M write.
214def A57WriteCMPsr : SchedWriteVariant<[
215  SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
216  SchedVar<NoSchedPred,      [A57Write_2cyc_1M]>
217]>;
// WriteCMP and WriteCMPsi map to fixed writes; only WriteCMPsr is a variant.
218def : SchedAlias<WriteCMP,   A57Write_1cyc_1I>;
219def : SchedAlias<WriteCMPsi, A57Write_2cyc_1M>;
220def : SchedAlias<WriteCMPsr, A57WriteCMPsr>;
221
222// --- 3.4 Move and Shift Instructions ---
223// Move, basic
224// MOV{S}, MOVW, MVN{S}
// Register/immediate moves, conditional moves and the BMOVPC*_CALL pseudos
// all share one write.
225def : InstRW<[A57Write_1cyc_1I], (instregex "MOV(r|i|i16|r_TC)",
226  "(t2)?MVN(CC)?(r|i)", "BMOVPCB_CALL", "BMOVPCRX_CALL",
227  "MOVCC(r|i|i16|i32imm)", "tMOV", "tMVN")>;
228
229// Move, shift by immed, setflags/no setflags
230// (ASR, LSL, LSR, ROR, RRX)=MOVsi, MVN
231// setflags = isCPSRDefined
// Flag-setting forms take the 2-cycle M write; the rest take the 1-cycle I
// write.
232def A57WriteMOVsi : SchedWriteVariant<[
233  SchedVar<IsCPSRDefinedPred,              [A57Write_2cyc_1M]>,
234  SchedVar<NoSchedPred,                    [A57Write_1cyc_1I]>
235]>;
236def : InstRW<[A57WriteMOVsi], (instregex "MOV(CC)?si", "MVNsi",
237  "ASRi", "(t2|t)ASRri", "LSRi", "(t2|t)LSRri", "LSLi", "(t2|t)LSLri", "RORi",
238  "(t2|t)RORri", "(t2)?RRX", "t2MOV", "tROR")>;
239
240// shift by register, conditional or unconditional, setflags/no setflags
// The combined "CPSR defined and predicated" entry is listed ahead of the two
// single-condition entries; NoSchedPred is listed last.
241def A57WriteMOVsr : SchedWriteVariant<[
242  SchedVar<IsCPSRDefinedAndPredicatedPred, [A57Write_2cyc_1I]>,
243  SchedVar<IsCPSRDefinedPred,              [A57Write_2cyc_1M]>,
244  SchedVar<IsPredicatedPred,               [A57Write_2cyc_1I]>,
245  SchedVar<NoSchedPred,                    [A57Write_1cyc_1I]>
246]>;
247def : InstRW<[A57WriteMOVsr], (instregex "MOV(CC)?sr", "MVNsr", "t2MVNs",
248  "ASRr", "(t2|t)ASRrr", "LSRr", "(t2|t)LSRrr", "LSLr", "(t2|t)?LSLrr", "RORr",
249  "(t2|t)RORrr")>;
250
251// Move, top
252// MOVT - A57Write_2cyc_1M for r0px, A57Write_1cyc_1I for r1p0 and later
// IsR1P0AndLaterPred is defined in this file as the constant false.
253def A57WriteMOVT : SchedWriteVariant<[
254  SchedVar<IsR1P0AndLaterPred,             [A57Write_1cyc_1I]>,
255  SchedVar<NoSchedPred,                    [A57Write_2cyc_1M]>
256]>;
257def : InstRW<[A57WriteMOVT], (instregex "MOVTi16")>;
258
// Write sequences for the global-address pseudos: two 1-cycle I writes
// followed by either a third I write (pc-relative form) or a 4-cycle L write
// (form that ends in a load).
259def A57WriteI2pc :
260  WriteSequence<[A57Write_1cyc_1I, A57Write_1cyc_1I, A57Write_1cyc_1I]>;
261def A57WriteI2ld :
262  WriteSequence<[A57Write_1cyc_1I, A57Write_1cyc_1I, A57Write_4cyc_1L]>;
// instregex patterns are only anchored at the start, so the bare pattern
// "MOV_ga_pcrel" would also match MOV_ga_pcrel_ldr and overlap with the
// dedicated entry on the next line. The trailing "$" restricts it to
// MOV_ga_pcrel itself.
263def : InstRW< [A57WriteI2pc], (instregex "MOV_ga_pcrel$")>;
264def : InstRW< [A57WriteI2ld], (instregex "MOV_ga_pcrel_ldr")>;
265
266// +2cyc for branch forms
// (3 cycles here versus the 1-cycle write used for the basic MOV entries)
267def : InstRW<[A57Write_3cyc_1I], (instregex "MOVPC(LR|RX)")>;
268
269// --- 3.5 Divide and Multiply Instructions ---
270// Divide: SDIV, UDIV
271// Latency from documentation: 4-20 cycles; the maximum is taken.
// Divide uses the 20-cycle M write.
272def : SchedAlias<WriteDIV, A57Write_20cyc_1M>;
273// Multiply: tMul not bound to common WriteRes types
274def : InstRW<[A57Write_3cyc_1M], (instregex "tMUL")>;
// 16- and 32-bit multiplies share the 3-cycle M write.
275def : SchedAlias<WriteMUL16, A57Write_3cyc_1M>;
276def : SchedAlias<WriteMUL32, A57Write_3cyc_1M>;
// ReadMUL gets an advance of 0 cycles.
277def : ReadAdvance<ReadMUL, 0>;
278
279// Multiply accumulate: MLA, MLS, SMLABB, SMLABT, SMLATB, SMLATT, SMLAWB,
280// SMLAWT, SMLAD{X}, SMLSD{X}, SMMLA{R}, SMMLS{R}
281// Multiply-accumulate pipelines support late-forwarding of accumulate operands
282// from similar μops, allowing a typical sequence of multiply-accumulate μops
283// to issue one every 1 cycle (sched advance = 2).
// A57WriteMLA (latency 3) and A57WriteMLAL (latency 4) both use the M unit.
// A57ReadMLA applies its 2-cycle advance only to those two writes.
284def A57WriteMLA : SchedWriteRes<[A57UnitM]> { let Latency = 3; }
285def A57WriteMLAL : SchedWriteRes<[A57UnitM]> { let Latency = 4; }
286def A57ReadMLA  : SchedReadAdvance<2, [A57WriteMLA, A57WriteMLAL]>;
287
// Thumb2 dual multiply-accumulate opcodes are bound explicitly.
288def : InstRW<[A57WriteMLA],
289  (instregex "t2SMLAD", "t2SMLADX", "t2SMLSD", "t2SMLSDX")>;
290
291def : SchedAlias<WriteMAC16, A57WriteMLA>;
292def : SchedAlias<WriteMAC32, A57WriteMLA>;
293def : SchedAlias<ReadMAC,    A57ReadMLA>;
294
// Both halves of the 64-bit accumulate use the 4-cycle write.
295def : SchedAlias<WriteMAC64Lo, A57WriteMLAL>;
296def : SchedAlias<WriteMAC64Hi, A57WriteMLAL>;
297
298// Multiply long: SMULL, UMULL
// Low and high result halves use the same 4-cycle M write.
299def : SchedAlias<WriteMUL64Lo, A57Write_4cyc_1M>;
300def : SchedAlias<WriteMUL64Hi, A57Write_4cyc_1M>;
301
302// --- 3.6 Saturating and Parallel Arithmetic Instructions ---
303// Parallel arith
304// SADD16, SADD8, SSUB16, SSUB8, UADD16, UADD8, USUB16, USUB8
305// Conditional GE-setting instructions require three extra μops
306// and two additional cycles to conditionally update the GE field.
// Predicated form: 4-cycle write; NoSchedPred entry: 2-cycle write.
307def A57WriteParArith : SchedWriteVariant<[
308  SchedVar<IsPredicatedPred, [A57Write_4cyc_1I_1M]>,
309  SchedVar<NoSchedPred,      [A57Write_2cyc_1I_1M]>
310]>;
311def : InstRW< [A57WriteParArith], (instregex
312  "(t2)?SADD(16|8)", "(t2)?SSUB(16|8)",
313  "(t2)?UADD(16|8)", "(t2)?USUB(16|8)")>;
314
315// Parallel arith with exchange: SASX, SSAX, UASX, USAX
// Same predicated/unpredicated split, one cycle longer in each case.
316def A57WriteParArithExch : SchedWriteVariant<[
317  SchedVar<IsPredicatedPred, [A57Write_5cyc_1I_1M]>,
318  SchedVar<NoSchedPred,      [A57Write_3cyc_1I_1M]>
319]>;
320def : InstRW<[A57WriteParArithExch],
321  (instregex "(t2)?SASX", "(t2)?SSAX", "(t2)?UASX", "(t2)?USAX")>;
322
323// Parallel halving arith
324// SHADD16, SHADD8, SHSUB16, SHSUB8, UHADD16, UHADD8, UHSUB16, UHSUB8
325def : InstRW<[A57Write_2cyc_1M], (instregex
326  "(t2)?SHADD(16|8)", "(t2)?SHSUB(16|8)",
327  "(t2)?UHADD(16|8)", "(t2)?UHSUB(16|8)")>;
328
329// Parallel halving arith with exchange
330// SHASX, SHSAX, UHASX, UHSAX
331def : InstRW<[A57Write_3cyc_1I_1M], (instregex "(t2)?SHASX", "(t2)?SHSAX",
332  "(t2)?UHASX", "(t2)?UHSAX")>;
333
334// Parallel saturating arith
335// QADD16, QADD8, QSUB16, QSUB8, UQADD16, UQADD8, UQSUB16, UQSUB8
// The t2 patterns carry no width suffix, so they cover every opcode that
// starts with t2QADD, t2UQADD, t2QSUB or t2UQSUB.
336def : InstRW<[A57Write_2cyc_1M], (instregex "QADD(16|8)", "QSUB(16|8)",
337  "UQADD(16|8)", "UQSUB(16|8)", "t2(U?)QADD", "t2(U?)QSUB")>;
338
339// Parallel saturating arith with exchange
340// QASX, QSAX, UQASX, UQSAX
341def : InstRW<[A57Write_3cyc_1I_1M], (instregex "(t2)?QASX", "(t2)?QSAX",
342  "(t2)?UQASX", "(t2)?UQSAX")>;
343
344// Saturate: SSAT, SSAT16, USAT, USAT16
345def : InstRW<[A57Write_2cyc_1M],
346  (instregex "(t2)?SSAT(16)?", "(t2)?USAT(16)?")>;
347
348// Saturating arith: QADD, QSUB
// Anchored with "$" so only the plain QADD/QSUB opcodes are selected here.
349def : InstRW<[A57Write_2cyc_1M], (instregex "QADD$", "QSUB$")>;
350
351// Saturating doubling arith: QDADD, QDSUB
352def : InstRW<[A57Write_3cyc_1I_1M], (instregex "(t2)?QDADD", "(t2)?QDSUB")>;
353
354// --- 3.7 Miscellaneous Data-Processing Instructions ---
355// Bit field extract: SBFX, UBFX
356def : InstRW<[A57Write_1cyc_1I], (instregex "(t2)?SBFX", "(t2)?UBFX")>;
357
358// Bit field insert/clear: BFI, BFC
359def : InstRW<[A57Write_2cyc_1M], (instregex "(t2)?BFI", "(t2)?BFC")>;
360
361// Select bytes, conditional/unconditional
// Predicated form: 2-cycle I write; NoSchedPred entry: 1-cycle I write.
362def A57WriteSEL : SchedWriteVariant<[
363  SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
364  SchedVar<NoSchedPred,      [A57Write_1cyc_1I]>
365]>;
366def : InstRW<[A57WriteSEL], (instregex "(t2)?SEL")>;
367
368// Sign/zero extend, normal: SXTB, SXTH, UXTB, UXTH
// "$" keeps the 16-bit-lane variants (e.g. SXTB16) out of this entry.
369def : InstRW<[A57Write_1cyc_1I],
370  (instregex "(t2|t)?SXT(B|H)$", "(t2|t)?UXT(B|H)$")>;
371
372// Sign/zero extend and add, normal: SXTAB, SXTAH, UXTAB, UXTAH
373def : InstRW<[A57Write_2cyc_1M],
374  (instregex "(t2)?SXTA(B|H)$", "(t2)?UXTA(B|H)$")>;
375
376// Sign/zero extend and add, parallel: SXTAB16, UXTAB16
377def : InstRW<[A57Write_4cyc_1M], (instregex "(t2)?SXTAB16", "(t2)?UXTAB16")>;
378
379// Sum of absolute differences: USAD8, USADA8
380def : InstRW<[A57Write_3cyc_1M], (instregex "(t2)?USAD8", "(t2)?USADA8")>;
381
382// --- 3.8 Load Instructions ---
383
384// Load, immed offset
385// LDR and LDRB have LDRi12 and LDRBi12 forms for immediate
386def : InstRW<[A57Write_4cyc_1L], (instregex "LDRi12", "LDRBi12",
387  "LDRcp", "(t2|t)?LDRConstPool", "LDRLIT_ga_(pcrel|abs)",
388  "PICLDR", "tLDR")>;
389
// Thumb2 non-writeback loads: word, byte and halfword, signed or unsigned.
390def : InstRW<[A57Write_4cyc_1L],
391  (instregex "t2LDRS?(B|H)?(pcrel|T|i8|i12|pci|pci_pic|s)?$")>;
392
393// For "Load, register offset, minus" we need +1cyc, +1I
394def A57WriteLdrAm3 : SchedWriteVariant<[
395  SchedVar<IsLdrAm3NegRegOffPred, [A57Write_5cyc_1I_1L]>,
396  SchedVar<NoSchedPred,           [A57Write_4cyc_1L]>
397]>;
398def : InstRW<[A57WriteLdrAm3], (instregex "LDR(H|SH|SB)$")>;
// LDRD uses the X2 form of the predicate, and the variant is listed twice in
// the InstRW entry.
399def A57WriteLdrAm3X2 : SchedWriteVariant<[
400  SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_5cyc_1I_1L]>,
401  SchedVar<NoSchedPred,             [A57Write_4cyc_1L]>
402]>;
403def : InstRW<[A57WriteLdrAm3X2, A57WriteLdrAm3X2], (instregex "LDRD$")>;
404def : InstRW<[A57Write_4cyc_1L, A57Write_4cyc_1L], (instregex "t2LDRDi8")>;
405
// LDR/LDRB with shifted-register offset: 5-cycle I+L write when the offset is
// scaled other than plus-LSL2 or uses a minus register, else 4-cycle L write.
406def A57WriteLdrAmLDSTSO : SchedWriteVariant<[
407  SchedVar<IsLdstsoScaledNotOptimalPred, [A57Write_5cyc_1I_1L]>,
408  SchedVar<IsLdstsoMinusRegPred,         [A57Write_5cyc_1I_1L]>,
409  SchedVar<NoSchedPred,                  [A57Write_4cyc_1L]>
410]>;
411def : InstRW<[A57WriteLdrAmLDSTSO], (instregex "LDRrs", "LDRBrs")>;
412
// Writeback writes: no processor resources and zero micro-ops, differing only
// in latency (1, 2 or 3 cycles). They are used below for the base-register
// update of pre/post-indexed loads and stores.
413def A57WrBackOne : SchedWriteRes<[]> {
414  let Latency = 1;
415  let NumMicroOps = 0;
416}
417def A57WrBackTwo : SchedWriteRes<[]> {
418  let Latency = 2;
419  let NumMicroOps = 0;
420}
421def A57WrBackThree : SchedWriteRes<[]> {
422  let Latency = 3;
423  let NumMicroOps = 0;
424}
425
426// --- LDR pre-indexed ---
427// Load, immed pre-indexed (4 cyc for load result, 1 cyc for Base update)
// In the load entries the load-result write is listed first and the writeback
// write second.
428def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackOne], (instregex "LDR_PRE_IMM",
429  "LDRB_PRE_IMM", "t2LDRB_PRE")>;
430
431// Load, register pre-indexed (4 cyc for load result, 2 cyc for Base update)
432// (5 cyc load result for not-lsl2 scaled)
433def A57WriteLdrAmLDSTSOPre : SchedWriteVariant<[
434  SchedVar<IsLdstsoScaledNotOptimalPredX2, [A57Write_5cyc_1I_1L]>,
435  SchedVar<NoSchedPred,                    [A57Write_4cyc_1L_1I]>
436]>;
437def : InstRW<[A57WriteLdrAmLDSTSOPre, A57WrBackTwo],
438  (instregex "LDR_PRE_REG", "LDRB_PRE_REG")>;
439
// Addrmode3 pre-indexed: writeback is 2 cycles for a register offset,
// otherwise 1 cycle.
440def A57WriteLdrAm3PreWrBack : SchedWriteVariant<[
441  SchedVar<IsLdrAm3RegOffPredX2, [A57WrBackTwo]>,
442  SchedVar<NoSchedPred,          [A57WrBackOne]>
443]>;
444def : InstRW<[A57Write_4cyc_1L, A57WriteLdrAm3PreWrBack],
445  (instregex "LDR(H|SH|SB)_PRE")>;
446def : InstRW<[A57Write_4cyc_1L, A57WrBackOne],
447  (instregex "t2LDR(H|SH|SB)?_PRE")>;
448
449// LDRD pre-indexed: 5(2) cyc for reg, 4(1) cyc for imm.
450def A57WriteLdrDAm3Pre : SchedWriteVariant<[
451  SchedVar<IsLdrAm3RegOffPredX3, [A57Write_5cyc_1I_1L]>,
452  SchedVar<NoSchedPred,          [A57Write_4cyc_1L_1I]>
453]>;
454def A57WriteLdrDAm3PreWrBack : SchedWriteVariant<[
455  SchedVar<IsLdrAm3RegOffPredX3, [A57WrBackTwo]>,
456  SchedVar<NoSchedPred,          [A57WrBackOne]>
457]>;
// Two load-result writes are listed, then the writeback write.
458def : InstRW<[A57WriteLdrDAm3Pre, A57WriteLdrDAm3Pre, A57WriteLdrDAm3PreWrBack],
459  (instregex "LDRD_PRE")>;
460def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I, A57WrBackOne],
461  (instregex "t2LDRD_PRE")>;
462
463// --- LDR post-indexed ---
// Immediate post-indexed forms: 4-cycle load write plus 1-cycle writeback.
464def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackOne], (instregex "LDR(T?)_POST_IMM",
465  "LDRB(T?)_POST_IMM", "LDR(SB|H|SH)Ti", "t2LDRB_POST")>;
466
// NOTE(review): this variant uses the operand-1 predicate, whereas the
// pre-indexed LDR(H|SH|SB)_PRE entry uses the X2 predicate and LDRD_POST uses
// X3 - confirm that index 1 is the intended operand for LDR(H|SH|SB)_POST.
467def A57WriteLdrAm3PostWrBack : SchedWriteVariant<[
468  SchedVar<IsLdrAm3RegOffPred, [A57WrBackTwo]>,
469  SchedVar<NoSchedPred,        [A57WrBackOne]>
470]>;
471def : InstRW<[A57Write_4cyc_1L_1I, A57WriteLdrAm3PostWrBack],
472  (instregex "LDR(H|SH|SB)_POST")>;
473def : InstRW<[A57Write_4cyc_1L, A57WrBackOne],
474  (instregex "t2LDR(H|SH|SB)?_POST")>;
475
// Register post-indexed LDR/LDRB: 2-cycle writeback.
476def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackTwo], (instregex "LDR_POST_REG",
477  "LDRB_POST_REG", "LDR(B?)T_POST$")>;
478
479def A57WriteLdrTRegPost : SchedWriteVariant<[
480  SchedVar<IsLdrAm2ScaledPred, [A57Write_4cyc_1I_1L_1M]>,
481  SchedVar<NoSchedPred,        [A57Write_4cyc_1L_1I]>
482]>;
483def A57WriteLdrTRegPostWrBack : SchedWriteVariant<[
484  SchedVar<IsLdrAm2ScaledPred, [A57WrBackThree]>,
485  SchedVar<NoSchedPred,        [A57WrBackTwo]>
486]>;
487// 4(3) "I0/I1,L,M" for scaled register, otherwise 4(2) "I0/I1,L"
488def : InstRW<[A57WriteLdrTRegPost, A57WriteLdrTRegPostWrBack],
489  (instregex "LDRT_POST_REG", "LDRBT_POST_REG")>;
490
491def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackTwo], (instregex "LDR(SB|H|SH)Tr")>;
492
493def A57WriteLdrAm3PostWrBackX3 : SchedWriteVariant<[
494  SchedVar<IsLdrAm3RegOffPredX3, [A57WrBackTwo]>,
495  SchedVar<NoSchedPred,          [A57WrBackOne]>
496]>;
497// LDRD post-indexed: 4(2) cyc for reg, 4(1) cyc for imm.
498def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I,
499  A57WriteLdrAm3PostWrBackX3], (instregex "LDRD_POST")>;
500def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I, A57WrBackOne],
501  (instregex "t2LDRD_POST")>;
502
503// --- Preload instructions ---
504// Preload, immed offset
505def : InstRW<[A57Write_4cyc_1L], (instregex "(t2)?PLDi12", "(t2)?PLDWi12",
506  "t2PLDW?(i8|pci|s)", "(t2)?PLI")>;
507
508// Preload, register offset,
509// 5cyc "I0/I1,L" for minus reg or scaled not plus lsl2
510// otherwise 4cyc "L"
// The X0 predicates are used here (index 0 is passed to the TII helpers).
511def A57WritePLD : SchedWriteVariant<[
512  SchedVar<IsLdstsoScaledNotOptimalPredX0, [A57Write_5cyc_1I_1L]>,
513  SchedVar<IsLdstsoMinusRegPredX0,         [A57Write_5cyc_1I_1L]>,
514  SchedVar<NoSchedPred,                    [A57Write_4cyc_1L]>
515]>;
516def : InstRW<[A57WritePLD], (instregex "PLDrs", "PLDWrs")>;
517
518// --- Load multiple instructions ---
// Defines A57LMAddrPred1 .. A57LMAddrPred8. Predicate N holds when
// (getLDMVariableDefsSize(MI) + 1) / 2 equals N, i.e. the variable-def count
// halved and rounded up.
519foreach NumAddr = 1-8 in {
520  def A57LMAddrPred#NumAddr :
521    SchedPredicate<"(TII->getLDMVariableDefsSize(*MI)+1)/2 == "#NumAddr>;
522}
523
// LDM without the base register in the list: 16 L writes in pairs, latency
// rising by one per pair from 3 to 10 cycles.
524def A57LDMOpsListNoregin : A57WriteLMOpsListType<
525                [A57Write_3cyc_1L, A57Write_3cyc_1L,
526                 A57Write_4cyc_1L, A57Write_4cyc_1L,
527                 A57Write_5cyc_1L, A57Write_5cyc_1L,
528                 A57Write_6cyc_1L, A57Write_6cyc_1L,
529                 A57Write_7cyc_1L, A57Write_7cyc_1L,
530                 A57Write_8cyc_1L, A57Write_8cyc_1L,
531                 A57Write_9cyc_1L, A57Write_9cyc_1L,
532                 A57Write_10cyc_1L, A57Write_10cyc_1L]>;
// A57LMAddrPredN selects the first 2*N writes of the list; the NoSchedPred
// entry uses the whole list.
533def A57WriteLDMnoreginlist : SchedWriteVariant<[
534  SchedVar<A57LMAddrPred1,     A57LDMOpsListNoregin.Writes[0-1]>,
535  SchedVar<A57LMAddrPred2,     A57LDMOpsListNoregin.Writes[0-3]>,
536  SchedVar<A57LMAddrPred3,     A57LDMOpsListNoregin.Writes[0-5]>,
537  SchedVar<A57LMAddrPred4,     A57LDMOpsListNoregin.Writes[0-7]>,
538  SchedVar<A57LMAddrPred5,     A57LDMOpsListNoregin.Writes[0-9]>,
539  SchedVar<A57LMAddrPred6,     A57LDMOpsListNoregin.Writes[0-11]>,
540  SchedVar<A57LMAddrPred7,     A57LDMOpsListNoregin.Writes[0-13]>,
541  SchedVar<A57LMAddrPred8,     A57LDMOpsListNoregin.Writes[0-15]>,
542  SchedVar<NoSchedPred,        A57LDMOpsListNoregin.Writes[0-15]>
543]> { let Variadic=1; }
544
// LDM with the base register in the list: same pairing as the "noregin" list,
// but each write is one cycle longer (4 to 11) and also names the I unit.
545def A57LDMOpsListRegin : A57WriteLMOpsListType<
546                [A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I,
547                 A57Write_5cyc_1L_1I, A57Write_5cyc_1L_1I,
548                 A57Write_6cyc_1L_1I, A57Write_6cyc_1L_1I,
549                 A57Write_7cyc_1L_1I, A57Write_7cyc_1L_1I,
550                 A57Write_8cyc_1L_1I, A57Write_8cyc_1L_1I,
551                 A57Write_9cyc_1L_1I, A57Write_9cyc_1L_1I,
552                 A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I,
553                 A57Write_11cyc_1L_1I, A57Write_11cyc_1L_1I]>;
// A57LMAddrPredN selects the first 2*N writes; NoSchedPred uses all 16.
554def A57WriteLDMreginlist : SchedWriteVariant<[
555  SchedVar<A57LMAddrPred1,     A57LDMOpsListRegin.Writes[0-1]>,
556  SchedVar<A57LMAddrPred2,     A57LDMOpsListRegin.Writes[0-3]>,
557  SchedVar<A57LMAddrPred3,     A57LDMOpsListRegin.Writes[0-5]>,
558  SchedVar<A57LMAddrPred4,     A57LDMOpsListRegin.Writes[0-7]>,
559  SchedVar<A57LMAddrPred5,     A57LDMOpsListRegin.Writes[0-9]>,
560  SchedVar<A57LMAddrPred6,     A57LDMOpsListRegin.Writes[0-11]>,
561  SchedVar<A57LMAddrPred7,     A57LDMOpsListRegin.Writes[0-13]>,
562  SchedVar<A57LMAddrPred8,     A57LDMOpsListRegin.Writes[0-15]>,
563  SchedVar<NoSchedPred,        A57LDMOpsListRegin.Writes[0-15]>
564]> { let Variadic=1; }
565
// LDM with base update: the list starts with the 1-cycle writeback write,
// followed by 16 L+I writes in pairs (3 to 10 cycles).
566def A57LDMOpsList_Upd : A57WriteLMOpsListType<
567              [A57WrBackOne,
568               A57Write_3cyc_1L_1I, A57Write_3cyc_1L_1I,
569               A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I,
570               A57Write_5cyc_1L_1I, A57Write_5cyc_1L_1I,
571               A57Write_6cyc_1L_1I, A57Write_6cyc_1L_1I,
572               A57Write_7cyc_1L_1I, A57Write_7cyc_1L_1I,
573               A57Write_8cyc_1L_1I, A57Write_8cyc_1L_1I,
574               A57Write_9cyc_1L_1I, A57Write_9cyc_1L_1I,
575               A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I]>;
// Because of the leading writeback entry, A57LMAddrPredN selects the first
// 2*N + 1 writes; NoSchedPred uses all 17.
576def A57WriteLDM_Upd : SchedWriteVariant<[
577  SchedVar<A57LMAddrPred1,     A57LDMOpsList_Upd.Writes[0-2]>,
578  SchedVar<A57LMAddrPred2,     A57LDMOpsList_Upd.Writes[0-4]>,
579  SchedVar<A57LMAddrPred3,     A57LDMOpsList_Upd.Writes[0-6]>,
580  SchedVar<A57LMAddrPred4,     A57LDMOpsList_Upd.Writes[0-8]>,
581  SchedVar<A57LMAddrPred5,     A57LDMOpsList_Upd.Writes[0-10]>,
582  SchedVar<A57LMAddrPred6,     A57LDMOpsList_Upd.Writes[0-12]>,
583  SchedVar<A57LMAddrPred7,     A57LDMOpsList_Upd.Writes[0-14]>,
584  SchedVar<A57LMAddrPred8,     A57LDMOpsList_Upd.Writes[0-16]>,
585  SchedVar<NoSchedPred,        A57LDMOpsList_Upd.Writes[0-16]>
586]> { let Variadic=1; }
587
// Top-level LDM variant: dispatches to the "reginlist" variant when the base
// register is in the list, otherwise to the "noreginlist" variant.
588def A57WriteLDM : SchedWriteVariant<[
589  SchedVar<IsLdmBaseRegInList, [A57WriteLDMreginlist]>,
590  SchedVar<NoSchedPred,        [A57WriteLDMnoreginlist]>
591]> { let Variadic=1; }
592
// Non-updating LDM forms ("$" keeps the _UPD/_RET opcodes out).
593def : InstRW<[A57WriteLDM], (instregex "(t|t2|sys)?LDM(IA|DA|DB|IB)$")>;
594
595// TODO: no writeback latency defined in documentation (implemented as 1 cyc)
596def : InstRW<[A57WriteLDM_Upd],
597  (instregex "(t|t2|sys)?LDM(IA_UPD|DA_UPD|DB_UPD|IB_UPD|IA_RET)", "tPOP")>;
598
599def : InstRW<[A57Write_5cyc_1L], (instregex "VLLDM")>;
600
601// --- 3.9 Store Instructions ---
602
603// Store, immed offset
604def : InstRW<[A57Write_1cyc_1S], (instregex "STRi12", "STRBi12", "PICSTR",
605  "t2STR(B?)(T|i12|i8|s)", "t2STRDi8", "t2STRH(i12|i8|s)", "tSTR")>;
606
607// Store, register offset
608// For minus or for not plus lsl2 scaled we need 3cyc "I0/I1, S",
609// otherwise 1cyc S.
610def A57WriteStrAmLDSTSO : SchedWriteVariant<[
611  SchedVar<IsLdstsoScaledNotOptimalPred, [A57Write_3cyc_1I_1S]>,
612  SchedVar<IsLdstsoMinusRegPred,         [A57Write_3cyc_1I_1S]>,
613  SchedVar<NoSchedPred,                  [A57Write_1cyc_1S]>
614]>;
615def : InstRW<[A57WriteStrAmLDSTSO], (instregex "STRrs", "STRBrs")>;
616
617// STRH,STRD: 3cyc "I0/I1, S" for minus reg, 1cyc S for imm or for plus reg.
// STRH uses the operand-1 predicate, STRD the X2 form.
618def A57WriteStrAm3 : SchedWriteVariant<[
619  SchedVar<IsLdrAm3NegRegOffPred, [A57Write_3cyc_1I_1S]>,
620  SchedVar<NoSchedPred,           [A57Write_1cyc_1S]>
621]>;
622def : InstRW<[A57WriteStrAm3], (instregex "STRH$")>;
623def A57WriteStrAm3X2 : SchedWriteVariant<[
624  SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_3cyc_1I_1S]>,
625  SchedVar<NoSchedPred,             [A57Write_1cyc_1S]>
626]>;
627def : InstRW<[A57WriteStrAm3X2], (instregex "STRD$")>;
628
629// Store, immed pre-indexed (1cyc "S, I0/I1", 1cyc writeback)
// For stores the writeback write is listed first and the store write second.
// STRH_PRE is deliberately not matched here: it has its own entry using the
// A57WriteStrAm3PreX2 variants, which distinguish the minus-register form.
// The earlier pattern "(t2)?STRH_(preidx|PRE)" matched STRH_PRE as well and
// overlapped with that entry.
630def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I], (instregex "STR_PRE_IMM",
631  "STRB_PRE_IMM", "STR(B)?(r|i)_preidx", "STRH_preidx", "t2STRH_(preidx|PRE)",
632  "t2STR(B?)_(PRE|preidx)", "t2STRD_PRE")>;
633
634// Store, register pre-indexed:
635// 1(1) "S, I0/I1" for plus reg
636// 3(2) "I0/I1, S" for minus reg
637// 1(2) "S, M" for scaled plus lsl2
638// 3(2) "I0/I1, S" for other scaled
// The "not optimal" and "minus register" entries are listed before the plain
// "scaled" entry, so the S+M write is left for the remaining scaled case.
639def A57WriteStrAmLDSTSOPre : SchedWriteVariant<[
640  SchedVar<IsLdstsoScaledNotOptimalPredX2, [A57Write_3cyc_1I_1S]>,
641  SchedVar<IsLdstsoMinusRegPredX2,         [A57Write_3cyc_1I_1S]>,
642  SchedVar<IsLdstsoScaledPredX2,           [A57Write_1cyc_1S_1M]>,
643  SchedVar<NoSchedPred,                    [A57Write_1cyc_1S_1I]>
644]>;
// Writeback: 2 cycles for any scaled or minus-register form, else 1 cycle.
645def A57WriteStrAmLDSTSOPreWrBack : SchedWriteVariant<[
646  SchedVar<IsLdstsoScaledPredX2,           [A57WrBackTwo]>,
647  SchedVar<IsLdstsoMinusRegPredX2,         [A57WrBackTwo]>,
648  SchedVar<NoSchedPred,                    [A57WrBackOne]>
649]>;
650def : InstRW<[A57WriteStrAmLDSTSOPreWrBack, A57WriteStrAmLDSTSOPre],
651  (instregex "STR_PRE_REG", "STRB_PRE_REG")>;
652
653// pre-indexed STRH/STRD (STRH_PRE, STRD_PRE)
654// 1(1) "S, I0/I1" for imm or reg plus
655// 3(2) "I0/I1, S" for reg minus
// STRH_PRE uses the X2 predicates, STRD_PRE the X3 predicates.
656def A57WriteStrAm3PreX2 : SchedWriteVariant<[
657  SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_3cyc_1I_1S]>,
658  SchedVar<NoSchedPred,             [A57Write_1cyc_1S_1I]>
659]>;
660def A57WriteStrAm3PreWrBackX2 : SchedWriteVariant<[
661  SchedVar<IsLdrAm3NegRegOffPredX2, [A57WrBackTwo]>,
662  SchedVar<NoSchedPred,             [A57WrBackOne]>
663]>;
664def : InstRW<[A57WriteStrAm3PreWrBackX2, A57WriteStrAm3PreX2],
665  (instregex "STRH_PRE")>;
666
667def A57WriteStrAm3PreX3 : SchedWriteVariant<[
668  SchedVar<IsLdrAm3NegRegOffPredX3, [A57Write_3cyc_1I_1S]>,
669  SchedVar<NoSchedPred,             [A57Write_1cyc_1S_1I]>
670]>;
671def A57WriteStrAm3PreWrBackX3 : SchedWriteVariant<[
672  SchedVar<IsLdrAm3NegRegOffPredX3, [A57WrBackTwo]>,
673  SchedVar<NoSchedPred,             [A57WrBackOne]>
674]>;
675def : InstRW<[A57WriteStrAm3PreWrBackX3, A57WriteStrAm3PreX3],
676  (instregex "STRD_PRE")>;
677
// Store, immediate post-indexed: 1-cycle writeback, 1-cycle S+I store write.
678def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I], (instregex "STR(T?)_POST_IMM",
679  "STRB(T?)_POST_IMM", "t2STR(B?)_POST")>;
680
681// 1(2) "S, M" for STR/STRB register post-indexed (both scaled or not)
682def : InstRW<[A57WrBackTwo, A57Write_1cyc_1S_1M], (instregex "STR(T?)_POST_REG",
683  "STRB(T?)_POST_REG", "STR(B?)T_POST$")>;
684
685// post-indexed STRH/STRD(STRH_POST, STRD_POST), STRHTi, STRHTr
686// 1(1) "S, I0/I1" both for reg or imm
687def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I],
688  (instregex "(t2)?STR(H|D)_POST", "STRHT(i|r)", "t2STRHT")>;
689
690// --- Store multiple instructions ---
691// TODO: no writeback latency defined in documentation
// A57LMAddrPredN selects the N-cycle S write; the NoSchedPred entry uses the
// 2-cycle write.
692def A57WriteSTM : SchedWriteVariant<[
693    SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S]>,
694    SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S]>,
695    SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S]>,
696    SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S]>,
697    SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S]>,
698    SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S]>,
699    SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S]>,
700    SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S]>,
701    SchedVar<NoSchedPred,    [A57Write_2cyc_1S]>
702]>;
// Updating forms: same latencies, with writes that also name the I unit.
703def A57WriteSTM_Upd : SchedWriteVariant<[
704    SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S_1I]>,
705    SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S_1I]>,
706    SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S_1I]>,
707    SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S_1I]>,
708    SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S_1I]>,
709    SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S_1I]>,
710    SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S_1I]>,
711    SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S_1I]>,
712    SchedVar<NoSchedPred,    [A57Write_2cyc_1S_1I]>
713]>;
714
715def : InstRW<[A57WriteSTM], (instregex "(t2|sys|t)?STM(IA|DA|DB|IB)$")>;
// Writeback is modelled with the 1-cycle writeback write, listed first.
716def : InstRW<[A57WrBackOne, A57WriteSTM_Upd],
717  (instregex "(t2|sys|t)?STM(IA_UPD|DA_UPD|DB_UPD|IB_UPD)", "tPUSH")>;
718
719def : InstRW<[A57Write_5cyc_1S], (instregex "VLSTM")>;
720
// --- 3.10 FP Data Processing Instructions ---
// Generic FP ALU writes (32- and 64-bit) both map to the 5-cycle V write.
def : SchedAlias<WriteFPALU32, A57Write_5cyc_1V>;
def : SchedAlias<WriteFPALU64, A57Write_5cyc_1V>;

// Scalar VABS (S/D/H): 3-cycle V write.
def : InstRW<[A57Write_3cyc_1V],
  (instregex "VABS(S|D|H)")>;
726
// FP compare: 3cyc F1 when unconditional, 6cyc "F0/F1, F1" when conditional.
// The predicated case is tested first; everything else takes the 3-cycle path.
def A57WriteVcmp : SchedWriteVariant<[
  SchedVar<IsPredicatedPred, [A57Write_6cyc_1V_1X]>,
  SchedVar<NoSchedPred, [A57Write_3cyc_1X]>
]>;

def : InstRW<[A57WriteVcmp],
  (instregex
    "VCMP(D|S|H|ZD|ZS|ZH)$",
    "VCMPE(D|S|H|ZD|ZS|ZH)")>;
734
// FP convert: every form below uses the 5-cycle V write.
def : InstRW<[A57Write_5cyc_1V],
  (instregex
    "VCVT(A|N|P|M)(SH|UH|SS|US|SD|UD)",
    "VCVT(BDH|THD|TDH)")>;
def : InstRW<[A57Write_5cyc_1V],
  (instregex
    "VTOSLS",
    "VTOUHS",
    "VTOULS")>;
def : SchedAlias<WriteFPCVT, A57Write_5cyc_1V>;

// VJCVT: 5-cycle V write as well.
def : InstRW<[A57Write_5cyc_1V],
  (instregex "VJCVT")>;

// FP round to integral (scalar H/S/D forms).
def : InstRW<[A57Write_5cyc_1V],
  (instregex "VRINT(A|N|P|M|Z|R|X)(H|S|D)$")>;
745
// FP divide and FP square root: 17-cycle W write for 32-bit, 32-cycle W write
// for 64-bit.
def : SchedAlias<WriteFPDIV32,  A57Write_17cyc_1W>;
def : SchedAlias<WriteFPDIV64,  A57Write_32cyc_1W>;
def : SchedAlias<WriteFPSQRT32, A57Write_17cyc_1W>;
def : SchedAlias<WriteFPSQRT64, A57Write_32cyc_1W>;

// Half-precision square root shares the 17-cycle W write.
def : InstRW<[A57Write_17cyc_1W],
  (instregex "VSQRTH")>;

// FP max/min.
def : InstRW<[A57Write_5cyc_1V],
  (instregex
    "VMAX",
    "VMIN")>;
756
// The FP multiply-accumulate pipelines support late forwarding of the result
// of an FP multiply μop to the accumulate operand of an FP multiply-accumulate
// μop, so the latter can potentially issue 1 cycle after the multiply μop
// was issued.
// FP multiply, FZ
def A57WriteVMUL : SchedWriteRes<[A57UnitV]> {
  let Latency = 5;
}

def : SchedAlias<WriteFPMUL32, A57WriteVMUL>;
def : SchedAlias<WriteFPMUL64, A57WriteVMUL>;
def : ReadAdvance<ReadFPMUL, 0>;
767
// FP multiply accumulate, FZ: 9cyc "F0/F1", or 4cyc for a sequenced accumulate.
// Covers VFMA, VFMS, VFNMA, VFNMS, VMLA, VMLS, VNMLA, VNMLS.
def A57WriteVFMA : SchedWriteRes<[A57UnitV]> {
  let Latency = 9;
}

// VFMA: 9 cycles in the common case, 4 cycles in a VFMA->VFMA chain
//       (5 cycles of read advance).
// VMUL: 5 cycles in the common case, 1 cycle in a VMUL->VFMA chain
//       (4 cycles of read advance).
// There is currently no way to give the VFMA operand a different read advance
// depending on whether it comes from a VFMA or from a VMUL, so a single read
// advance of 5 is used for both. The resulting zero (rather than one) cycle
// latency for VMUL->VFMA shouldn't break anything.
// The ASIMD VMUL/VFMA instructions are in the same situation.
// What would ideally be written:
//   def A57ReadVFMA : SchedRead;
//   def : ReadAdvance<A57ReadVFMA, 5, [A57WriteVFMA]>;
//   def : ReadAdvance<A57ReadVFMA, 4, [A57WriteVMUL]>;
def A57ReadVFMA5 : SchedReadAdvance<5, [A57WriteVFMA, A57WriteVMUL]>;

def : SchedAlias<WriteFPMAC32, A57WriteVFMA>;
def : SchedAlias<WriteFPMAC64, A57WriteVFMA>;
def : SchedAlias<ReadFPMAC,    A57ReadVFMA5>;
786
// VMLAH/VMLSH are not bound to scheduling classes by default, so they get a
// custom binding here.
def : InstRW<[A57WriteVFMA, A57ReadVFMA5, ReadFPMUL, ReadFPMUL],
  (instregex
    "VMLAH",
    "VMLSH",
    "VNMLAH",
    "VNMLSH")>;

// Dot-product forms reuse the FP multiply write.
def : InstRW<[A57WriteVMUL],
  (instregex
    "VUDOTD",
    "VSDOTD",
    "VUDOTQ",
    "VSDOTQ")>;

// VNEG and VSEL: 3-cycle V write.
def : InstRW<[A57Write_3cyc_1V],
  (instregex "VNEG")>;
def : InstRW<[A57Write_3cyc_1V],
  (instregex "VSEL")>;
796
// --- 3.11 FP Miscellaneous Instructions ---
// VMOV, immediate or register: 3cyc "F0/F1".
def : InstRW<[A57Write_3cyc_1V],
  (instregex "FCONST(D|S|H)")>;
def : InstRW<[A57Write_3cyc_1V],
  (instregex "VMOV(D|S|H)(cc)?$")>;

def : InstRW<[A57Write_3cyc_1V],
  (instregex "VINSH")>;

// FP transfer, vfp to core reg: 5cyc L.
// FP transfer, core reg to vfp: 5cyc L.
def : SchedAlias<WriteFPMOV, A57Write_5cyc_1L>;

// The common code declares VMOVRRS/VMOVRRD with a single WriteFPMOV (instead
// of two), so both writes are listed explicitly here.
def : InstRW<[A57Write_5cyc_1L, A57Write_5cyc_1L],
  (instregex "VMOV(RRS|RRD)")>;
809
// FP transfer, core reg to upper or lower half of vfp D-reg: 8cyc "L, F0/F1".
// The write must use the V (F0/F1) unit, matching the other 8cyc "L, F0/F1"
// entries in this model (VDUP from core reg, VSETLN); the "_1L_1I" write is
// reserved for "L, I0/I1" transfers such as VGETLN.
def : InstRW<[A57Write_8cyc_1L_1V], (instregex "VMOVDRR")>;
812
// --- 3.12 FP Load Instructions ---
// Scalar FP loads (D/S/H): 5-cycle L write.
def : InstRW<[A57Write_5cyc_1L],
  (instregex "VLDR(D|S|H)")>;

// VLDMQIA uses the same 5-cycle L write.
def : InstRW<[A57Write_5cyc_1L],
  (instregex "VLDMQIA$")>;
817
// FP load multiple (VLDM)

// Write list for the non-predicated VLDM: entries come in pairs, each pair one
// cycle later than the previous one (5, 5, 6, 6, ... 12, 12).
def A57VLDMOpsListUncond : A57WriteLMOpsListType<[
  A57Write_5cyc_1L,  A57Write_5cyc_1L,
  A57Write_6cyc_1L,  A57Write_6cyc_1L,
  A57Write_7cyc_1L,  A57Write_7cyc_1L,
  A57Write_8cyc_1L,  A57Write_8cyc_1L,
  A57Write_9cyc_1L,  A57Write_9cyc_1L,
  A57Write_10cyc_1L, A57Write_10cyc_1L,
  A57Write_11cyc_1L, A57Write_11cyc_1L,
  A57Write_12cyc_1L, A57Write_12cyc_1L
]>;

// A57LMAddrPredN selects the first 2*N entries of the list; the fallback uses
// the whole list.
def A57WriteVLDMuncond : SchedWriteVariant<[
  SchedVar<A57LMAddrPred1, A57VLDMOpsListUncond.Writes[0-1]>,
  SchedVar<A57LMAddrPred2, A57VLDMOpsListUncond.Writes[0-3]>,
  SchedVar<A57LMAddrPred3, A57VLDMOpsListUncond.Writes[0-5]>,
  SchedVar<A57LMAddrPred4, A57VLDMOpsListUncond.Writes[0-7]>,
  SchedVar<A57LMAddrPred5, A57VLDMOpsListUncond.Writes[0-9]>,
  SchedVar<A57LMAddrPred6, A57VLDMOpsListUncond.Writes[0-11]>,
  SchedVar<A57LMAddrPred7, A57VLDMOpsListUncond.Writes[0-13]>,
  SchedVar<A57LMAddrPred8, A57VLDMOpsListUncond.Writes[0-15]>,
  SchedVar<NoSchedPred, A57VLDMOpsListUncond.Writes[0-15]>
]> {
  let Variadic = 1;
}
840
// Write list for the predicated VLDM: every entry is one cycle later than the
// previous one (5, 6, 7, ... 20).
def A57VLDMOpsListCond : A57WriteLMOpsListType<[
  A57Write_5cyc_1L,  A57Write_6cyc_1L,
  A57Write_7cyc_1L,  A57Write_8cyc_1L,
  A57Write_9cyc_1L,  A57Write_10cyc_1L,
  A57Write_11cyc_1L, A57Write_12cyc_1L,
  A57Write_13cyc_1L, A57Write_14cyc_1L,
  A57Write_15cyc_1L, A57Write_16cyc_1L,
  A57Write_17cyc_1L, A57Write_18cyc_1L,
  A57Write_19cyc_1L, A57Write_20cyc_1L
]>;

// A57LMAddrPredN selects the first 2*N entries of the list; the fallback uses
// the whole list.
def A57WriteVLDMcond : SchedWriteVariant<[
  SchedVar<A57LMAddrPred1, A57VLDMOpsListCond.Writes[0-1]>,
  SchedVar<A57LMAddrPred2, A57VLDMOpsListCond.Writes[0-3]>,
  SchedVar<A57LMAddrPred3, A57VLDMOpsListCond.Writes[0-5]>,
  SchedVar<A57LMAddrPred4, A57VLDMOpsListCond.Writes[0-7]>,
  SchedVar<A57LMAddrPred5, A57VLDMOpsListCond.Writes[0-9]>,
  SchedVar<A57LMAddrPred6, A57VLDMOpsListCond.Writes[0-11]>,
  SchedVar<A57LMAddrPred7, A57VLDMOpsListCond.Writes[0-13]>,
  SchedVar<A57LMAddrPred8, A57VLDMOpsListCond.Writes[0-15]>,
  SchedVar<NoSchedPred, A57VLDMOpsListCond.Writes[0-15]>
]> {
  let Variadic = 1;
}
861
// Top-level VLDM write: predicated instructions take the "cond" variant, all
// others the "uncond" one.
def A57WriteVLDM : SchedWriteVariant<[
  SchedVar<IsPredicatedPred, [A57WriteVLDMcond]>,
  SchedVar<NoSchedPred, [A57WriteVLDMuncond]>
]> {
  let Variadic = 1;
}

def : InstRW<[A57WriteVLDM],
  (instregex "VLDM(DIA|SIA)$")>;
868
// Base-updating counterpart of the non-predicated VLDM list: same paired
// latencies (5, 5, 6, 6, ... 12, 12), every entry using the "_1L_1I" write.
def A57VLDMOpsListUncond_Upd : A57WriteLMOpsListType<[
  A57Write_5cyc_1L_1I,  A57Write_5cyc_1L_1I,
  A57Write_6cyc_1L_1I,  A57Write_6cyc_1L_1I,
  A57Write_7cyc_1L_1I,  A57Write_7cyc_1L_1I,
  A57Write_8cyc_1L_1I,  A57Write_8cyc_1L_1I,
  A57Write_9cyc_1L_1I,  A57Write_9cyc_1L_1I,
  A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I,
  A57Write_11cyc_1L_1I, A57Write_11cyc_1L_1I,
  A57Write_12cyc_1L_1I, A57Write_12cyc_1L_1I
]>;

// A57LMAddrPredN selects the first 2*N entries of the list; the fallback uses
// the whole list.
def A57WriteVLDMuncond_UPD : SchedWriteVariant<[
  SchedVar<A57LMAddrPred1, A57VLDMOpsListUncond_Upd.Writes[0-1]>,
  SchedVar<A57LMAddrPred2, A57VLDMOpsListUncond_Upd.Writes[0-3]>,
  SchedVar<A57LMAddrPred3, A57VLDMOpsListUncond_Upd.Writes[0-5]>,
  SchedVar<A57LMAddrPred4, A57VLDMOpsListUncond_Upd.Writes[0-7]>,
  SchedVar<A57LMAddrPred5, A57VLDMOpsListUncond_Upd.Writes[0-9]>,
  SchedVar<A57LMAddrPred6, A57VLDMOpsListUncond_Upd.Writes[0-11]>,
  SchedVar<A57LMAddrPred7, A57VLDMOpsListUncond_Upd.Writes[0-13]>,
  SchedVar<A57LMAddrPred8, A57VLDMOpsListUncond_Upd.Writes[0-15]>,
  SchedVar<NoSchedPred, A57VLDMOpsListUncond_Upd.Writes[0-15]>
]> {
  let Variadic = 1;
}
889
// Base-updating counterpart of the predicated VLDM list: latencies step by
// one cycle per entry (5, 6, 7, ... 20), every entry using the "_1L_1I" write.
def A57VLDMOpsListCond_Upd : A57WriteLMOpsListType<[
  A57Write_5cyc_1L_1I,  A57Write_6cyc_1L_1I,
  A57Write_7cyc_1L_1I,  A57Write_8cyc_1L_1I,
  A57Write_9cyc_1L_1I,  A57Write_10cyc_1L_1I,
  A57Write_11cyc_1L_1I, A57Write_12cyc_1L_1I,
  A57Write_13cyc_1L_1I, A57Write_14cyc_1L_1I,
  A57Write_15cyc_1L_1I, A57Write_16cyc_1L_1I,
  A57Write_17cyc_1L_1I, A57Write_18cyc_1L_1I,
  A57Write_19cyc_1L_1I, A57Write_20cyc_1L_1I
]>;

// A57LMAddrPredN selects the first 2*N entries of the list; the fallback uses
// the whole list.
def A57WriteVLDMcond_UPD : SchedWriteVariant<[
  SchedVar<A57LMAddrPred1, A57VLDMOpsListCond_Upd.Writes[0-1]>,
  SchedVar<A57LMAddrPred2, A57VLDMOpsListCond_Upd.Writes[0-3]>,
  SchedVar<A57LMAddrPred3, A57VLDMOpsListCond_Upd.Writes[0-5]>,
  SchedVar<A57LMAddrPred4, A57VLDMOpsListCond_Upd.Writes[0-7]>,
  SchedVar<A57LMAddrPred5, A57VLDMOpsListCond_Upd.Writes[0-9]>,
  SchedVar<A57LMAddrPred6, A57VLDMOpsListCond_Upd.Writes[0-11]>,
  SchedVar<A57LMAddrPred7, A57VLDMOpsListCond_Upd.Writes[0-13]>,
  SchedVar<A57LMAddrPred8, A57VLDMOpsListCond_Upd.Writes[0-15]>,
  SchedVar<NoSchedPred, A57VLDMOpsListCond_Upd.Writes[0-15]>
]> {
  let Variadic = 1;
}
910
// Top-level write for base-updating VLDM: predicated instructions take the
// "cond" variant, all others the "uncond" one.
def A57WriteVLDM_UPD : SchedWriteVariant<[
  SchedVar<IsPredicatedPred, [A57WriteVLDMcond_UPD]>,
  SchedVar<NoSchedPred, [A57WriteVLDMuncond_UPD]>
]> {
  let Variadic = 1;
}

// A57WrBackOne is listed ahead of the load writes.
def : InstRW<[A57WrBackOne, A57WriteVLDM_UPD],
  (instregex "VLDM(DIA_UPD|DDB_UPD|SIA_UPD|SDB_UPD)")>;
918
// --- 3.13 FP Store Instructions ---
// Scalar FP stores (D/S/H): 1-cycle S write.
def : InstRW<[A57Write_1cyc_1S],
  (instregex "VSTR(D|S|H)")>;

// VSTMQIA: 2-cycle S write.
def : InstRW<[A57Write_2cyc_1S],
  (instregex "VSTMQIA$")>;
923
// Write bound to VSTMSIA: one more cycle for each A57LMAddrPred step (1..8);
// any other case falls back to the 2-cycle write.
def A57WriteVSTMs : SchedWriteVariant<[
  SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S]>,
  SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S]>,
  SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S]>,
  SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S]>,
  SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S]>,
  SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S]>,
  SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S]>,
  SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S]>,
  SchedVar<NoSchedPred, [A57Write_2cyc_1S]>
]>;
// Write bound to VSTMDIA: two more cycles for each A57LMAddrPred step
// (2, 4, ... 16); any other case falls back to the 4-cycle write.
def A57WriteVSTMd : SchedWriteVariant<[
  SchedVar<A57LMAddrPred1, [A57Write_2cyc_1S]>,
  SchedVar<A57LMAddrPred2, [A57Write_4cyc_1S]>,
  SchedVar<A57LMAddrPred3, [A57Write_6cyc_1S]>,
  SchedVar<A57LMAddrPred4, [A57Write_8cyc_1S]>,
  SchedVar<A57LMAddrPred5, [A57Write_10cyc_1S]>,
  SchedVar<A57LMAddrPred6, [A57Write_12cyc_1S]>,
  SchedVar<A57LMAddrPred7, [A57Write_14cyc_1S]>,
  SchedVar<A57LMAddrPred8, [A57Write_16cyc_1S]>,
  SchedVar<NoSchedPred, [A57Write_4cyc_1S]>
]>;
// Write bound to the base-updating VSTMS forms: same ladder as A57WriteVSTMs,
// every entry using the "_1S_1I" write.
def A57WriteVSTMs_Upd : SchedWriteVariant<[
  SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S_1I]>,
  SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S_1I]>,
  SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S_1I]>,
  SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S_1I]>,
  SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S_1I]>,
  SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S_1I]>,
  SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S_1I]>,
  SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S_1I]>,
  SchedVar<NoSchedPred, [A57Write_2cyc_1S_1I]>
]>;
// Write bound to the base-updating VSTMD forms: two more cycles for each
// A57LMAddrPred step (2, 4, ... 16), every entry using the "_1S_1I" write.
// The NoSchedPred fallback uses the same write as the A57LMAddrPred2 entry,
// as every other store-multiple variant in this model does (A57WriteVSTMd
// falls back to 4 cycles); the previous 2-cycle fallback matched the
// S-register variant instead.
def A57WriteVSTMd_Upd : SchedWriteVariant<[
    SchedVar<A57LMAddrPred1, [A57Write_2cyc_1S_1I]>,
    SchedVar<A57LMAddrPred2, [A57Write_4cyc_1S_1I]>,
    SchedVar<A57LMAddrPred3, [A57Write_6cyc_1S_1I]>,
    SchedVar<A57LMAddrPred4, [A57Write_8cyc_1S_1I]>,
    SchedVar<A57LMAddrPred5, [A57Write_10cyc_1S_1I]>,
    SchedVar<A57LMAddrPred6, [A57Write_12cyc_1S_1I]>,
    SchedVar<A57LMAddrPred7, [A57Write_14cyc_1S_1I]>,
    SchedVar<A57LMAddrPred8, [A57Write_16cyc_1S_1I]>,
    SchedVar<NoSchedPred,    [A57Write_4cyc_1S_1I]>
]>;
968
// Plain VSTM forms use the matching S- or D-register variant on its own.
def : InstRW<[A57WriteVSTMs],
  (instregex "VSTMSIA$")>;
def : InstRW<[A57WriteVSTMd],
  (instregex "VSTMDIA$")>;

// Base-updating forms list A57WrBackOne ahead of the store write.
def : InstRW<[A57WrBackOne, A57WriteVSTMs_Upd],
  (instregex "VSTM(SIA_UPD|SDB_UPD)")>;
def : InstRW<[A57WrBackOne, A57WriteVSTMd_Upd],
  (instregex "VSTM(DIA_UPD|DDB_UPD)")>;
975
// --- 3.14 ASIMD Integer Instructions ---

// ASIMD absolute diff: integer VABD is 3cyc F0/F1.
def : InstRW<[A57Write_3cyc_1V],
  (instregex "VABD(s|u)")>;

// ASIMD absolute diff accumulate: 4(1) F1 for the D-form, 5(2) F1 for the
// Q-form. Both forms use a read advance of 3 against their own write.
def A57WriteVABAD : SchedWriteRes<[A57UnitX]> {
  let Latency = 4;
}
def A57ReadVABAD : SchedReadAdvance<3, [A57WriteVABAD]>;
def : InstRW<[A57WriteVABAD, A57ReadVABAD],
  (instregex "VABA(s|u)(v8i8|v4i16|v2i32)")>;

def A57WriteVABAQ : SchedWriteRes<[A57UnitX]> {
  let Latency = 5;
}
def A57ReadVABAQ : SchedReadAdvance<3, [A57WriteVABAQ]>;
def : InstRW<[A57WriteVABAQ, A57ReadVABAQ],
  (instregex "VABA(s|u)(v16i8|v8i16|v4i32)")>;

// ASIMD absolute diff accumulate long: VABAL is 4(1) F1.
def A57WriteVABAL : SchedWriteRes<[A57UnitX]> {
  let Latency = 4;
}
def A57ReadVABAL : SchedReadAdvance<3, [A57WriteVABAL]>;
def : InstRW<[A57WriteVABAL, A57ReadVABAL],
  (instregex "VABAL(s|u)")>;

// ASIMD absolute diff long: VABDL is 3cyc F0/F1.
def : InstRW<[A57Write_3cyc_1V],
  (instregex "VABDL(s|u)")>;
998
// ASIMD arith, basic
def : InstRW<[A57Write_3cyc_1V],
  (instregex
    "VADDv",
    "VADDL",
    "VADDW",
    "VNEG(s8d|s16d|s32d|s8q|s16q|s32q|d|q)",
    "VPADDi",
    "VPADDL",
    "VSUBv",
    "VSUBL",
    "VSUBW")>;

// ASIMD arith, complex
def : InstRW<[A57Write_3cyc_1V],
  (instregex
    "VABS",
    "VADDHN",
    "VHADD",
    "VHSUB",
    "VQABS",
    "VQADD",
    "VQNEG",
    "VQSUB",
    "VRADDHN",
    "VRHADD",
    "VRSUBHN",
    "VSUBHN")>;

// ASIMD compare
def : InstRW<[A57Write_3cyc_1V],
  (instregex
    "VCEQ",
    "VCGE",
    "VCGT",
    "VCLE",
    "VTST",
    "VCLT")>;

// ASIMD logical
def : InstRW<[A57Write_3cyc_1V],
  (instregex
    "VAND",
    "VBIC",
    "VMVN",
    "VORR",
    "VORN",
    "VEOR")>;

// ASIMD max/min
def : InstRW<[A57Write_3cyc_1V],
  (instregex
    "(VMAX|VMIN)(s|u)",
    "(VPMAX|VPMIN)(s8|s16|s32|u8|u16|u32)")>;
1020
// ASIMD multiply, D-form: 5cyc F0 on r0px, 4cyc F0 on r1p0 and later.
// Cortex-A57 r1p0 and later reduce the latency of the ASIMD multiply and
// multiply-with-accumulate instructions relative to r0pX.
def A57WriteVMULD_VecInt : SchedWriteVariant<[
  SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
  SchedVar<NoSchedPred, [A57Write_5cyc_1W]>
]>;
def : InstRW<[A57WriteVMULD_VecInt],
  (instregex
    "VMUL(v8i8|v4i16|v2i32|pd)",
    "VMULsl(v4i16|v2i32)",
    "VQDMULH(sl)?(v4i16|v2i32)",
    "VQRDMULH(sl)?(v4i16|v2i32)")>;

// ASIMD multiply, Q-form: 6cyc F0 on r0px, 5cyc F0 on r1p0 and later.
def A57WriteVMULQ_VecInt : SchedWriteVariant<[
  SchedVar<IsR1P0AndLaterPred, [A57Write_5cyc_1W]>,
  SchedVar<NoSchedPred, [A57Write_6cyc_1W]>
]>;
def : InstRW<[A57WriteVMULQ_VecInt],
  (instregex
    "VMUL(v16i8|v8i16|v4i32|pq)",
    "VMULsl(v8i16|v4i32)",
    "VQDMULH(sl)?(v8i16|v4i32)",
    "VQRDMULH(sl)?(v8i16|v4i32)")>;
1038
// ASIMD multiply accumulate, D-form:
// 5cyc F0 on r0px, 4cyc F0 on r1p0 and later, 1cyc in an accumulate sequence
// (i.e. a ReadAdvance of 4 or 3 respectively).
def A57WriteVMLAD_VecInt : SchedWriteVariant<[
  SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
  SchedVar<NoSchedPred, [A57Write_5cyc_1W]>
]>;
def A57ReadVMLAD_VecInt : SchedReadVariant<[
  SchedVar<IsR1P0AndLaterPred,
           [SchedReadAdvance<3, [A57WriteVMLAD_VecInt]>]>,
  SchedVar<NoSchedPred,
           [SchedReadAdvance<4, [A57WriteVMLAD_VecInt]>]>
]>;
def : InstRW<[A57WriteVMLAD_VecInt, A57ReadVMLAD_VecInt],
  (instregex
    "VMLA(sl)?(v8i8|v4i16|v2i32)",
    "VMLS(sl)?(v8i8|v4i16|v2i32)")>;
1051
// ASIMD multiply accumulate, Q-form:
// 6cyc F0 on r0px, 5cyc F0 on r1p0 and later, 2cyc in an accumulate sequence
// (i.e. a ReadAdvance of 4 or 3 respectively).
def A57WriteVMLAQ_VecInt : SchedWriteVariant<[
  SchedVar<IsR1P0AndLaterPred, [A57Write_5cyc_1W]>,
  SchedVar<NoSchedPred, [A57Write_6cyc_1W]>
]>;
def A57ReadVMLAQ_VecInt : SchedReadVariant<[
  SchedVar<IsR1P0AndLaterPred,
           [SchedReadAdvance<3, [A57WriteVMLAQ_VecInt]>]>,
  SchedVar<NoSchedPred,
           [SchedReadAdvance<4, [A57WriteVMLAQ_VecInt]>]>
]>;
def : InstRW<[A57WriteVMLAQ_VecInt, A57ReadVMLAQ_VecInt],
  (instregex
    "VMLA(sl)?(v16i8|v8i16|v4i32)",
    "VMLS(sl)?(v16i8|v8i16|v4i32)")>;
1064
// ASIMD multiply accumulate long:
// 5cyc F0 on r0px, 4cyc F0 on r1p0 and later, 1cyc in an accumulate sequence
// (i.e. a ReadAdvance of 4 or 3 respectively).
def A57WriteVMLAL_VecInt : SchedWriteVariant<[
  SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
  SchedVar<NoSchedPred, [A57Write_5cyc_1W]>
]>;
def A57ReadVMLAL_VecInt : SchedReadVariant<[
  SchedVar<IsR1P0AndLaterPred,
           [SchedReadAdvance<3, [A57WriteVMLAL_VecInt]>]>,
  SchedVar<NoSchedPred,
           [SchedReadAdvance<4, [A57WriteVMLAL_VecInt]>]>
]>;
def : InstRW<[A57WriteVMLAL_VecInt, A57ReadVMLAL_VecInt],
  (instregex
    "VMLAL(s|u)",
    "VMLSL(s|u)")>;
1077
// ASIMD multiply accumulate saturating long:
// 5cyc F0 on r0px, 4cyc F0 on r1p0 and later, 2cyc in an accumulate sequence
// (i.e. a ReadAdvance of 3 or 2 respectively).
def A57WriteVQDMLAL_VecInt : SchedWriteVariant<[
  SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
  SchedVar<NoSchedPred, [A57Write_5cyc_1W]>
]>;
def A57ReadVQDMLAL_VecInt : SchedReadVariant<[
  SchedVar<IsR1P0AndLaterPred,
           [SchedReadAdvance<2, [A57WriteVQDMLAL_VecInt]>]>,
  SchedVar<NoSchedPred,
           [SchedReadAdvance<3, [A57WriteVQDMLAL_VecInt]>]>
]>;
def : InstRW<[A57WriteVQDMLAL_VecInt, A57ReadVQDMLAL_VecInt],
  (instregex
    "VQDMLAL",
    "VQDMLSL")>;

// VQRDMLAH/VQRDMLSH (vector saturating rounding doubling multiply
// accumulate/subtract) reuse the scheduling info of VQDMLAL/VQDMLSL.
def : InstRW<[A57WriteVQDMLAL_VecInt, A57ReadVQDMLAL_VecInt],
  (instregex
    "VQRDMLAH",
    "VQRDMLSH")>;
1095
// ASIMD multiply long: 5cyc F0 on r0px, 4cyc F0 on r1p0 and later.
def A57WriteVMULL_VecInt : SchedWriteVariant<[
  SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
  SchedVar<NoSchedPred, [A57Write_5cyc_1W]>
]>;
def : InstRW<[A57WriteVMULL_VecInt],
  (instregex
    "VMULL(s|u|p8|sls|slu)",
    "VQDMULL")>;
1103
// ASIMD pairwise add and accumulate:
// 4cyc F1, 1cyc in an accumulate sequence (3cyc ReadAdvance).
def A57WriteVPADAL : SchedWriteRes<[A57UnitX]> {
  let Latency = 4;
}
def A57ReadVPADAL : SchedReadAdvance<3, [A57WriteVPADAL]>;
def : InstRW<[A57WriteVPADAL, A57ReadVPADAL],
  (instregex "VPADAL(s|u)")>;

// ASIMD shift accumulate:
// 4cyc F1, 1cyc in an accumulate sequence (3cyc ReadAdvance).
def A57WriteVSRA : SchedWriteRes<[A57UnitX]> {
  let Latency = 4;
}
def A57ReadVSRA : SchedReadAdvance<3, [A57WriteVSRA]>;
def : InstRW<[A57WriteVSRA, A57ReadVSRA],
  (instregex
    "VSRA",
    "VRSRA")>;
1115
// ASIMD shift by immed, basic
def : InstRW<[A57Write_3cyc_1X],
  (instregex
    "VMOVL",
    "VSHLi",
    "VSHLL",
    "VSHR(s|u)",
    "VSHRN")>;

// ASIMD shift by immed, complex
def : InstRW<[A57Write_4cyc_1X],
  (instregex
    "VQRSHRN",
    "VQRSHRUN",
    "VQSHL(si|ui|su)",
    "VQSHRN",
    "VQSHRUN",
    "VRSHR(s|u)",
    "VRSHRN")>;

// ASIMD shift by immed and insert, basic, D-form
def : InstRW<[A57Write_4cyc_1X],
  (instregex
    "VSLI(v8i8|v4i16|v2i32|v1i64)",
    "VSRI(v8i8|v4i16|v2i32|v1i64)")>;

// ASIMD shift by immed and insert, basic, Q-form
def : InstRW<[A57Write_5cyc_1X],
  (instregex
    "VSLI(v16i8|v8i16|v4i32|v2i64)",
    "VSRI(v16i8|v8i16|v4i32|v2i64)")>;
1132
// ASIMD shift by register, basic, D-form
def : InstRW<[A57Write_3cyc_1X],
  (instregex "VSHL(s|u)(v8i8|v4i16|v2i32|v1i64)")>;

// ASIMD shift by register, basic, Q-form
def : InstRW<[A57Write_4cyc_1X],
  (instregex "VSHL(s|u)(v16i8|v8i16|v4i32|v2i64)")>;

// ASIMD shift by register, complex, D-form (VQRSHL, VQSHL, VRSHL)
def : InstRW<[A57Write_4cyc_1X],
  (instregex
    "VQRSHL(s|u)(v8i8|v4i16|v2i32|v1i64)",
    "VQSHL(s|u)(v8i8|v4i16|v2i32|v1i64)",
    "VRSHL(s|u)(v8i8|v4i16|v2i32|v1i64)")>;

// ASIMD shift by register, complex, Q-form
def : InstRW<[A57Write_5cyc_1X],
  (instregex
    "VQRSHL(s|u)(v16i8|v8i16|v4i32|v2i64)",
    "VQSHL(s|u)(v16i8|v8i16|v4i32|v2i64)",
    "VRSHL(s|u)(v16i8|v8i16|v4i32|v2i64)")>;
1151
// --- 3.15 ASIMD Floating-Point Instructions ---
// ASIMD FP absolute value
def : InstRW<[A57Write_3cyc_1V],
  (instregex "VABS(fd|fq|hd|hq)")>;

// ASIMD FP arith
def : InstRW<[A57Write_5cyc_1V],
  (instregex
    "VABD(fd|fq|hd|hq)",
    "VADD(fd|fq|hd|hq)",
    "VPADD(f|h)",
    "VSUB(fd|fq|hd|hq)")>;

// VCADD/VCMLA share the 5-cycle V write.
def : InstRW<[A57Write_5cyc_1V],
  (instregex
    "VCADD",
    "VCMLA")>;

// ASIMD FP compare
def : InstRW<[A57Write_5cyc_1V],
  (instregex
    "VAC(GE|GT|LE|LT)",
    "VC(EQ|GE|GT|LE)(fd|fq|hd|hq)")>;

// ASIMD FP convert, integer
def : InstRW<[A57Write_5cyc_1V],
  (instregex
    "VCVT(f2sd|f2ud|s2fd|u2fd|f2sq|f2uq|s2fq|u2fq|f2xsd|f2xud|xs2fd|xu2fd)",
    "VCVT(f2xsq|f2xuq|xs2fq|xu2fq)",
    "VCVT(AN|MN|NN|PN)(SDf|SQf|UDf|UQf|SDh|SQh|UDh|UQh)")>;

// ASIMD FP convert, half-precision: 8cyc F0/F1
def : InstRW<[A57Write_8cyc_1V],
  (instregex
    "VCVT(h2sd|h2ud|s2hd|u2hd|h2sq|h2uq|s2hq|u2hq|h2xsd|h2xud|xs2hd|xu2hd)",
    "VCVT(h2xsq|h2xuq|xs2hq|xu2hq)",
    "VCVT(f2h|h2f)")>;

// ASIMD FP max/min
def : InstRW<[A57Write_5cyc_1V],
  (instregex
    "(VMAX|VMIN)(fd|fq|hd|hq)",
    "(VPMAX|VPMIN)(f|h)",
    "(NEON|VFP)_VMAXNM",
    "(NEON|VFP)_VMINNM")>;
1182
// ASIMD FP multiply
def A57WriteVMUL_VecFP : SchedWriteRes<[A57UnitV]> {
  let Latency = 5;
}
def : InstRW<[A57WriteVMUL_VecFP],
  (instregex "VMUL(sl)?(fd|fq|hd|hq)")>;

// ASIMD FP multiply accumulate: 9cyc F0/F1, 4cyc in an accumulate sequence.
// The read advance of 5 applies to results of both the multiply-accumulate
// and the multiply write.
def A57WriteVMLA_VecFP : SchedWriteRes<[A57UnitV]> {
  let Latency = 9;
}
def A57ReadVMLA_VecFP :
  SchedReadAdvance<5, [A57WriteVMLA_VecFP, A57WriteVMUL_VecFP]>;
def : InstRW<[A57WriteVMLA_VecFP, A57ReadVMLA_VecFP],
  (instregex
    "(VMLA|VMLS)(sl)?(fd|fq|hd|hq)",
    "(VFMA|VFMS)(fd|fq|hd|hq)")>;

// ASIMD FP negate
def : InstRW<[A57Write_3cyc_1V],
  (instregex "VNEG(fd|f32q|hd|hq)")>;

// ASIMD FP round to integral
def : InstRW<[A57Write_5cyc_1V],
  (instregex "VRINT(AN|MN|NN|PN|XN|ZN)(Df|Qf|Dh|Qh)")>;
1200
// --- 3.16 ASIMD Miscellaneous Instructions ---

// ASIMD bitwise insert
def : InstRW<[A57Write_3cyc_1V],
  (instregex
    "VBIF",
    "VBIT",
    "VBSL")>;

// ASIMD count
def : InstRW<[A57Write_3cyc_1V],
  (instregex
    "VCLS",
    "VCLZ",
    "VCNT")>;

// ASIMD duplicate, core reg: 8cyc "L, F0/F1"
def : InstRW<[A57Write_8cyc_1L_1V],
  (instregex "VDUP(8|16|32)(d|q)")>;

// ASIMD duplicate, scalar: 3cyc "F0/F1"
def : InstRW<[A57Write_3cyc_1V],
  (instregex "VDUPLN(8|16|32)(d|q)")>;

// ASIMD extract
def : InstRW<[A57Write_3cyc_1V],
  (instregex "VEXT(d|q)(8|16|32|64)")>;

// ASIMD move, immed
def : InstRW<[A57Write_3cyc_1V],
  (instregex
    "VMOV(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v1i64|v2i64|v2f32|v4f32)",
    "VMOVD0",
    "VMOVQ0")>;

// ASIMD move, narrowing
def : InstRW<[A57Write_3cyc_1V],
  (instregex "VMOVN")>;

// ASIMD move, saturating
def : InstRW<[A57Write_4cyc_1X],
  (instregex "VQMOVN")>;

// ASIMD reciprocal estimate
def : InstRW<[A57Write_5cyc_1V],
  (instregex
    "VRECPE",
    "VRSQRTE")>;

// ASIMD reciprocal step, FZ
def : InstRW<[A57Write_9cyc_1V],
  (instregex
    "VRECPS",
    "VRSQRTS")>;

// ASIMD reverse, swap, table lookup (1-2 reg)
def : InstRW<[A57Write_3cyc_1V],
  (instregex
    "VREV",
    "VSWP",
    "VTB(L|X)(1|2)")>;

// ASIMD table lookup (3-4 reg)
def : InstRW<[A57Write_6cyc_1V],
  (instregex
    "VTBL(3|4)",
    "VTBX(3|4)")>;

// ASIMD transfer, scalar to core reg: 6cyc "L, I0/I1"
def : InstRW<[A57Write_6cyc_1L_1I],
  (instregex "VGETLN")>;

// ASIMD transfer, core reg to scalar: 8cyc "L, F0/F1"
def : InstRW<[A57Write_8cyc_1L_1V],
  (instregex "VSETLN")>;

// ASIMD transpose (two writes listed)
def : InstRW<[A57Write_3cyc_1V, A57Write_3cyc_1V],
  (instregex "VTRN")>;

// ASIMD unzip/zip, D-form
def : InstRW<[A57Write_3cyc_1V, A57Write_3cyc_1V],
  (instregex
    "VUZPd",
    "VZIPd")>;

// ASIMD unzip/zip, Q-form
def : InstRW<[A57Write_6cyc_1V, A57Write_6cyc_1V],
  (instregex
    "VUZPq",
    "VZIPq")>;
1257
// --- 3.17 ASIMD Load Instructions ---

// The generic vector load/store writes get empty resource lists here because
// this processor overrides them via InstRW.
def : WriteRes<WriteVLD1, []>;
def : WriteRes<WriteVLD2, []>;
def : WriteRes<WriteVLD3, []>;
def : WriteRes<WriteVLD4, []>;

def : WriteRes<WriteVST1, []>;
def : WriteRes<WriteVST2, []>;
def : WriteRes<WriteVST3, []>;
def : WriteRes<WriteVST4, []>;
1269
// VLD1, 1-2 reg: 5cyc L, +I for writeback, 1 cyc wb latency.
def : InstRW<[A57Write_5cyc_1L],
  (instregex "VLD1(d|q)(8|16|32|64)$")>;
def : InstRW<[A57Write_5cyc_1L_1I, A57WrBackOne],
  (instregex "VLD1(d|q)(8|16|32|64)wb")>;

// VLD1, 3-4 reg: 6cyc L, +I for writeback, 1 cyc wb latency.
def : InstRW<[A57Write_6cyc_1L],
  (instregex
    "VLD1(d|q)(8|16|32|64)(T|Q)$",
    "VLD1d64(T|Q)Pseudo")>;

def : InstRW<[A57Write_6cyc_1L_1I, A57WrBackOne],
  (instregex "VLD1(d|q)(8|16|32|64)(T|Q)wb")>;

// ASIMD load, 1 element, one lane and all lanes: 8cyc "L, F0/F1".
def : InstRW<[A57Write_8cyc_1L_1V],
  (instregex
    "VLD1(LN|DUP)(d|q)(8|16|32)$",
    "VLD1(LN|DUP)(d|q)(8|16|32)Pseudo$")>;
def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
  (instregex
    "VLD1(LN|DUP)(d|q)(8|16|32)(wb|_UPD)",
    "VLD1LNq(8|16|32)Pseudo_UPD")>;
1287
// ASIMD load, 2 element, multiple, 2 reg: 8cyc "L, F0/F1".
def : InstRW<[A57Write_8cyc_1L_1V],
  (instregex
    "VLD2(d|q)(8|16|32)$",
    "VLD2q(8|16|32)Pseudo$")>;
def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
  (instregex
    "VLD2(d|q)(8|16|32)wb",
    "VLD2q(8|16|32)PseudoWB")>;

// ASIMD load, 2 element, multiple, 4 reg: 9cyc "L, F0/F1".
def : InstRW<[A57Write_9cyc_1L_1V],
  (instregex "VLD2b(8|16|32)$")>;
def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
  (instregex "VLD2b(8|16|32)wb")>;

// ASIMD load, 2 element, one lane and all lanes: 8cyc "L, F0/F1".
def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V],
  (instregex
    "VLD2(DUP|LN)(d|q)(8|16|32|8x2|16x2|32x2)$",
    "VLD2LN(d|q)(8|16|32)Pseudo$")>;
// Two results plus the writeback result.
def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V, A57WrBackOne],
  (instregex "VLD2LN(d|q)(8|16|32)_UPD$")>;
// One result plus the writeback result.
def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
  (instregex
    "VLD2DUPd(8|16|32|8x2|16x2|32x2)wb",
    "VLD2LN(d|q)(8|16|32)Pseudo_UPD")>;
1310
// ASIMD load, 3 element, multiple, 3 reg: 9cyc "L, F0/F1".
// Three results.
def : InstRW<[A57Write_9cyc_1L_1V,
              A57Write_9cyc_1L_1V,
              A57Write_9cyc_1L_1V],
  (instregex "VLD3(d|q)(8|16|32)$")>;
// One result.
def : InstRW<[A57Write_9cyc_1L_1V],
  (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo$")>;
// Three results plus writeback.
def : InstRW<[A57Write_9cyc_1L_1V_1I,
              A57Write_9cyc_1L_1V_1I,
              A57Write_9cyc_1L_1V_1I,
              A57WrBackOne],
  (instregex "VLD3(d|q)(8|16|32)_UPD$")>;
// One result plus writeback.
def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
  (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo_UPD")>;

// ASIMD load, 3 element, one lane, size 32: 8cyc "L, F0/F1".
def : InstRW<[A57Write_8cyc_1L_1V,
              A57Write_8cyc_1L_1V,
              A57Write_8cyc_1L_1V],
  (instregex
    "VLD3LN(d|q)32$",
    "VLD3LN(d|q)32Pseudo$")>;
def : InstRW<[A57Write_8cyc_1L_1V_1I,
              A57Write_8cyc_1L_1V_1I,
              A57Write_8cyc_1L_1V_1I,
              A57WrBackOne],
  (instregex "VLD3LN(d|q)32_UPD")>;
def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
  (instregex "VLD3LN(d|q)32Pseudo_UPD")>;

// ASIMD load, 3 element, one lane, size 8/16: 9cyc "L, F0/F1".
def : InstRW<[A57Write_9cyc_1L_1V,
              A57Write_9cyc_1L_1V,
              A57Write_9cyc_1L_1V],
  (instregex
    "VLD3LN(d|q)(8|16)$",
    "VLD3LN(d|q)(8|16)Pseudo$")>;
def : InstRW<[A57Write_9cyc_1L_1V_1I,
              A57Write_9cyc_1L_1V_1I,
              A57Write_9cyc_1L_1V_1I,
              A57WrBackOne],
  (instregex "VLD3LN(d|q)(8|16)_UPD")>;
def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
  (instregex "VLD3LN(d|q)(8|16)Pseudo_UPD")>;

// ASIMD load, 3 element, all lanes: 8cyc "L, F0/F1".
def : InstRW<[A57Write_8cyc_1L_1V,
              A57Write_8cyc_1L_1V,
              A57Write_8cyc_1L_1V],
  (instregex
    "VLD3DUP(d|q)(8|16|32)$",
    "VLD3DUP(d|q)(8|16|32)Pseudo$")>;
def : InstRW<[A57Write_8cyc_1L_1V_1I,
              A57Write_8cyc_1L_1V_1I,
              A57Write_8cyc_1L_1V_1I,
              A57WrBackOne],
  (instregex "VLD3DUP(d|q)(8|16|32)_UPD")>;
def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
  (instregex "VLD3DUP(d|q)(8|16|32)Pseudo_UPD")>;
1355
// ASIMD load, 4 element, multiple, 4 reg: 9cyc "L, F0/F1".
def : InstRW<[A57Write_9cyc_1L_1V,
              A57Write_9cyc_1L_1V,
              A57Write_9cyc_1L_1V,
              A57Write_9cyc_1L_1V],
  (instregex "VLD4(d|q)(8|16|32)$")>;
def : InstRW<[A57Write_9cyc_1L_1V],
  (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo$")>;
def : InstRW<[A57Write_9cyc_1L_1V_1I,
              A57Write_9cyc_1L_1V_1I,
              A57Write_9cyc_1L_1V_1I,
              A57Write_9cyc_1L_1V_1I,
              A57WrBackOne],
  (instregex "VLD4(d|q)(8|16|32)_UPD")>;
def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
  (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo_UPD")>;

// ASIMD load, 4 element, one lane, size 32: 8cyc "L, F0/F1".
def : InstRW<[A57Write_8cyc_1L_1V,
              A57Write_8cyc_1L_1V,
              A57Write_8cyc_1L_1V,
              A57Write_8cyc_1L_1V],
  (instregex
    "VLD4LN(d|q)32$",
    "VLD4LN(d|q)32Pseudo$")>;
def : InstRW<[A57Write_8cyc_1L_1V_1I,
              A57Write_8cyc_1L_1V_1I,
              A57Write_8cyc_1L_1V_1I,
              A57Write_8cyc_1L_1V_1I,
              A57WrBackOne],
  (instregex "VLD4LN(d|q)32_UPD")>;
def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
  (instregex "VLD4LN(d|q)32Pseudo_UPD")>;

// ASIMD load, 4 element, one lane, size 8/16: 9cyc "L, F0/F1".
def : InstRW<[A57Write_9cyc_1L_1V,
              A57Write_9cyc_1L_1V,
              A57Write_9cyc_1L_1V,
              A57Write_9cyc_1L_1V],
  (instregex
    "VLD4LN(d|q)(8|16)$",
    "VLD4LN(d|q)(8|16)Pseudo$")>;
def : InstRW<[A57Write_9cyc_1L_1V_1I,
              A57Write_9cyc_1L_1V_1I,
              A57Write_9cyc_1L_1V_1I,
              A57Write_9cyc_1L_1V_1I,
              A57WrBackOne],
  (instregex "VLD4LN(d|q)(8|16)_UPD")>;
def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
  (instregex "VLD4LN(d|q)(8|16)Pseudo_UPD")>;

// ASIMD load, 4 element, all lanes: 8cyc "L, F0/F1".
def : InstRW<[A57Write_8cyc_1L_1V,
              A57Write_8cyc_1L_1V,
              A57Write_8cyc_1L_1V,
              A57Write_8cyc_1L_1V],
  (instregex
    "VLD4DUP(d|q)(8|16|32)$",
    "VLD4DUP(d|q)(8|16|32)Pseudo$")>;
def : InstRW<[A57Write_8cyc_1L_1V_1I,
              A57Write_8cyc_1L_1V_1I,
              A57Write_8cyc_1L_1V_1I,
              A57Write_8cyc_1L_1V_1I,
              A57WrBackOne],
  (instregex "VLD4DUP(d|q)(8|16|32)_UPD")>;
def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
  (instregex "VLD4DUP(d|q)(8|16|32)Pseudo_UPD")>;
1403
1404// --- 3.18 ASIMD Store Instructions ---
1405
// In every writeback ("wb"/"WB") entry below, A57WrBackOne is listed first
// and is followed by the "_1I" variant of the store write.

// ASIMD store, 1 element, multiple, 1 reg: 1cyc S
def : InstRW<
    [A57Write_1cyc_1S],
    (instregex "VST1d(8|16|32|64)$")>;
def : InstRW<
    [A57WrBackOne, A57Write_1cyc_1S_1I],
    (instregex "VST1d(8|16|32|64)wb")>;

// ASIMD store, 1 element, multiple, 2 reg: 2cyc S
def : InstRW<
    [A57Write_2cyc_1S],
    (instregex "VST1q(8|16|32|64)$")>;
def : InstRW<
    [A57WrBackOne, A57Write_2cyc_1S_1I],
    (instregex "VST1q(8|16|32|64)wb")>;

// ASIMD store, 1 element, multiple, 3 reg: 3cyc S
def : InstRW<
    [A57Write_3cyc_1S],
    (instregex "VST1d(8|16|32|64)T$", "VST1d64TPseudo$")>;
def : InstRW<
    [A57WrBackOne, A57Write_3cyc_1S_1I],
    (instregex "VST1d(8|16|32|64)Twb", "VST1d64TPseudoWB")>;

// ASIMD store, 1 element, multiple, 4 reg: 4cyc S
def : InstRW<
    [A57Write_4cyc_1S],
    (instregex "VST1d(8|16|32|64)(Q|QPseudo)$")>;
def : InstRW<
    [A57WrBackOne, A57Write_4cyc_1S_1I],
    (instregex "VST1d(8|16|32|64)(Qwb|QPseudoWB)")>;
// ASIMD store, 1 element, one lane: 3cyc "F0/F1, S"
// The d-register forms are matched directly; the q-register forms are matched
// through their Pseudo names.
def : InstRW<
    [A57Write_3cyc_1S_1V],
    (instregex "VST1LNd(8|16|32)$", "VST1LNq(8|16|32)Pseudo$")>;
// _UPD forms: A57WrBackOne first, then the "_1I" store write.
def : InstRW<
    [A57WrBackOne, A57Write_3cyc_1S_1V_1I],
    (instregex "VST1LNd(8|16|32)_UPD", "VST1LNq(8|16|32)Pseudo_UPD")>;
// In every writeback ("wb"/"WB"/"_UPD") entry below, A57WrBackOne is listed
// first and is followed by the "_1I" variant of the store write.

// ASIMD store, 2 element, multiple, 2 reg: 3cyc "F0/F1, S"
def : InstRW<
    [A57Write_3cyc_1S_1V],
    (instregex "VST2(d|b)(8|16|32)$")>;
def : InstRW<
    [A57WrBackOne, A57Write_3cyc_1S_1V_1I],
    (instregex "VST2(b|d)(8|16|32)wb")>;

// ASIMD store, 2 element, multiple, 4 reg: 4cyc "F0/F1, S"
def : InstRW<
    [A57Write_4cyc_1S_1V],
    (instregex "VST2q(8|16|32)$", "VST2q(8|16|32)Pseudo$")>;
def : InstRW<
    [A57WrBackOne, A57Write_4cyc_1S_1V_1I],
    (instregex "VST2q(8|16|32)wb", "VST2q(8|16|32)PseudoWB")>;

// ASIMD store, 2 element, one lane: 3cyc "F0/F1, S"
def : InstRW<
    [A57Write_3cyc_1S_1V],
    (instregex "VST2LN(d|q)(8|16|32)$",
               "VST2LN(d|q)(8|16|32)Pseudo$")>;
def : InstRW<
    [A57WrBackOne, A57Write_3cyc_1S_1V_1I],
    (instregex "VST2LN(d|q)(8|16|32)_UPD",
               "VST2LN(d|q)(8|16|32)Pseudo_UPD")>;
// In every _UPD entry below, A57WrBackOne is listed first and is followed by
// the "_1I" variant of the store write.

// ASIMD store, 3 element, multiple, 3 reg: 3cyc "F0/F1, S"
// "(oddP|P)seudo" covers both the ...Pseudo and ...oddPseudo opcode names.
def : InstRW<
    [A57Write_3cyc_1S_1V],
    (instregex "VST3(d|q)(8|16|32)$",
               "VST3(d|q)(8|16|32)(oddP|P)seudo$")>;
def : InstRW<
    [A57WrBackOne, A57Write_3cyc_1S_1V_1I],
    (instregex "VST3(d|q)(8|16|32)_UPD",
               "VST3(d|q)(8|16|32)(oddP|P)seudo_UPD$")>;

// ASIMD store, 3 element, one lane: 3cyc "F0/F1, S"
def : InstRW<
    [A57Write_3cyc_1S_1V],
    (instregex "VST3LN(d|q)(8|16|32)$",
               "VST3LN(d|q)(8|16|32)Pseudo$")>;
def : InstRW<
    [A57WrBackOne, A57Write_3cyc_1S_1V_1I],
    (instregex "VST3LN(d|q)(8|16|32)_UPD",
               "VST3LN(d|q)(8|16|32)Pseudo_UPD")>;
// In every _UPD entry below, A57WrBackOne is listed first and is followed by
// the "_1I" variant of the store write.

// ASIMD store, 4 element, multiple, 4 reg: 4cyc "F0/F1, S"
// "(oddP|P)seudo" covers both the ...Pseudo and ...oddPseudo opcode names.
def : InstRW<
    [A57Write_4cyc_1S_1V],
    (instregex "VST4(d|q)(8|16|32)$",
               "VST4(d|q)(8|16|32)(oddP|P)seudo$")>;
def : InstRW<
    [A57WrBackOne, A57Write_4cyc_1S_1V_1I],
    (instregex "VST4(d|q)(8|16|32)_UPD",
               "VST4(d|q)(8|16|32)(oddP|P)seudo_UPD$")>;

// ASIMD store, 4 element, one lane: 3cyc "F0/F1, S"
def : InstRW<
    [A57Write_3cyc_1S_1V],
    (instregex "VST4LN(d|q)(8|16|32)$",
               "VST4LN(d|q)(8|16|32)Pseudo$")>;
def : InstRW<
    [A57WrBackOne, A57Write_3cyc_1S_1V_1I],
    (instregex "VST4LN(d|q)(8|16|32)_UPD",
               "VST4LN(d|q)(8|16|32)Pseudo_UPD")>;
1469
// --- 3.19 Cryptography Extensions ---
// Each entry below selects opcodes by a "^"-anchored name pattern and assigns
// a single write; the 3-cycle entries use A57Write_3cyc_1W and the 6-cycle
// entries use A57Write_6cyc_2W, except SHA1SU0 which uses A57Write_6cyc_2V.
// Crypto AES ops
// AESD, AESE, AESIMC, AESMC: 3cyc F0
def : InstRW<[A57Write_3cyc_1W], (instregex "^AES")>;
// Crypto polynomial (64x64) multiply long (VMULL.P64): 3cyc F0
def : InstRW<[A57Write_3cyc_1W], (instregex "^VMULLp64")>;
// Crypto SHA1 xor ops: 6cyc F0/F1
def : InstRW<[A57Write_6cyc_2V], (instregex "^SHA1SU0")>;
// Crypto SHA1 fast ops: 3cyc F0
def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA1(H|SU1)")>;
// Crypto SHA1 slow ops: 6cyc F0
// The character class [CMP] selects the SHA1C, SHA1M and SHA1P name prefixes.
def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA1[CMP]")>;
// Crypto SHA256 fast ops: 3cyc F0
def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA256SU0")>;
// Crypto SHA256 slow ops: 6cyc F0
def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA256(H|H2|SU1)")>;
1486
// --- 3.20 CRC ---
// Opcodes whose names start with CRC32, with or without a "t2" prefix, use
// the 3-cycle A57Write_3cyc_1W write.
def : InstRW<[A57Write_3cyc_1W], (instregex "^(t2)?CRC32")>;
1489
// -----------------------------------------------------------------------------
// Common definitions
// Maps the generic scheduling writes/reads onto A57-specific definitions.

// WriteNoop uses no resources, with zero latency and zero micro-ops.
def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; }
// Generic ALU write: 1-cycle "1I" write.
def : SchedAlias<WriteALU, A57Write_1cyc_1I>;

// Branch writes: WriteBr uses the 1-cycle "1B" write; WriteBrL and WriteBrTbl
// both use the 1-cycle "1B_1I" write. WritePreLd uses the 4-cycle "1L" write.
def : SchedAlias<WriteBr, A57Write_1cyc_1B>;
def : SchedAlias<WriteBrL, A57Write_1cyc_1B_1I>;
def : SchedAlias<WriteBrTbl, A57Write_1cyc_1B_1I>;
def : SchedAlias<WritePreLd, A57Write_4cyc_1L>;

// Generic load and store writes: 4-cycle "1L" and 1-cycle "1S" respectively.
def : SchedAlias<WriteLd, A57Write_4cyc_1L>;
def : SchedAlias<WriteST, A57Write_1cyc_1S>;
// ReadALU is given a read advance of 0 cycles.
def : ReadAdvance<ReadALU, 0>;
1503
1504} // SchedModel = CortexA57Model
1505
1506