1//=- ARMScheduleM7.td - ARM Cortex-M7 Scheduling Definitions -*- tablegen -*-=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the SchedRead/Write data for the ARM Cortex-M7 processor.
10//
11//===----------------------------------------------------------------------===//
12
// Machine model record for the Cortex-M7.
def CortexM7Model : SchedMachineModel {
  // In-order core: no micro-op buffer.
  let MicroOpBufferSize = 0;
  // Most instructions can be dual-issued.
  let IssueWidth = 2;
  // Best-case load-to-use latency.
  let LoadLatency = 2;
  // Forward-branch mispredicts actually cost 6, but a modelled penalty
  // of 4 works better.
  let MispredictPenalty = 4;
  let CompleteModel = 0;
}
21
22let SchedModel = CortexM7Model in {
23
24//===--------------------------------------------------------------------===//
25// The Cortex-M7 has two ALU, two LOAD, a STORE, a MAC, a BRANCH and a VFP
26// pipe. The stages relevant to scheduling are as follows:
27//
28//   EX1: address generation  shifts
29//   EX2: fast load data      ALUs                  FP operation
30//   EX3: slow load data      integer writeback     FP operation
31//   EX4: store data                                FP writeback
32//
33// There are shifters in both EX1 and EX2, and some instructions can be
34// flexibly allocated between them.  EX2 is used as the "zero" point
35// for scheduling, so simple ALU operations executing in EX2 will have
36// ReadAdvance<0> (the default) for their source operands and Latency = 1.
37
// Processor resources.  Everything except M7UnitALU is declared with
// BufferSize = 0; M7UnitALU keeps the default BufferSize.
let BufferSize = 0 in {
  // Two load units, usable individually or through the M7UnitLoad group.
  def M7UnitLoadL : ProcResource<1>;
  def M7UnitLoadH : ProcResource<1>;
  def M7UnitLoad  : ProcResGroup<[M7UnitLoadL, M7UnitLoadH]>;
  def M7UnitStore : ProcResource<1>;
}

def M7UnitALU : ProcResource<2>;

let BufferSize = 0 in {
  def M7UnitShift1 : ProcResource<1>;
  def M7UnitShift2 : ProcResource<1>;
  def M7UnitMAC    : ProcResource<1>;
  def M7UnitBranch : ProcResource<1>;
  def M7UnitVFP    : ProcResource<1>;
  // Two VFP ports, usable individually or through the M7UnitVPort group.
  def M7UnitVPortL : ProcResource<1>;
  def M7UnitVPortH : ProcResource<1>;
  def M7UnitVPort  : ProcResGroup<[M7UnitVPortL, M7UnitVPortH]>;
  def M7UnitSIMD   : ProcResource<1>;
}
52
53//===---------------------------------------------------------------------===//
// Subtarget-specific SchedWrite types which map ProcResources and set latency.
55
// Basic ALU operations, with and without shifts.  All have Latency 1; the
// shifted forms additionally occupy M7UnitShift1.
let Latency = 1 in {
  def : WriteRes<WriteALU,    [M7UnitALU]>;
  def : WriteRes<WriteALUsi,  [M7UnitALU, M7UnitShift1]>;
  def : WriteRes<WriteALUsr,  [M7UnitALU, M7UnitShift1]>;
  def : WriteRes<WriteALUSsr, [M7UnitALU, M7UnitShift1]>;
}

// Compares.  The shifted forms occupy M7UnitShift1 and have Latency 2.
def : WriteRes<WriteCMP, [M7UnitALU]> {
  let Latency = 1;
}
let Latency = 2 in {
  def : WriteRes<WriteCMPsi, [M7UnitALU, M7UnitShift1]>;
  def : WriteRes<WriteCMPsr, [M7UnitALU, M7UnitShift1]>;
}
69
// Multiplies and multiply-accumulates.  All have Latency 2 on M7UnitMAC.
// The "Hi" half of a 64-bit result uses no resources and no micro-ops.
let Latency = 2 in {
  // Multiplies.
  def : WriteRes<WriteMUL16,   [M7UnitMAC]>;
  def : WriteRes<WriteMUL32,   [M7UnitMAC]>;
  def : WriteRes<WriteMUL64Lo, [M7UnitMAC]>;
  def : WriteRes<WriteMUL64Hi, []> {
    let NumMicroOps = 0;
  }

  // Multiply-accumulates.
  def : WriteRes<WriteMAC16,   [M7UnitMAC]>;
  def : WriteRes<WriteMAC32,   [M7UnitMAC]>;
  def : WriteRes<WriteMAC64Lo, [M7UnitMAC]>;
  def : WriteRes<WriteMAC64Hi, []> {
    let NumMicroOps = 0;
  }
}
85
// Divisions.  Marked SingleIssue: they cannot be dual-issued with any
// other instruction.
def : WriteRes<WriteDIV, [M7UnitALU]> {
  let SingleIssue = 1;
  let Latency = 7;
}

// Loads and stores.
def : WriteRes<WriteLd, [M7UnitLoad]> {
  let Latency = 1;
}
let Latency = 2 in {
  def : WriteRes<WritePreLd, [M7UnitLoad]>;
  def : WriteRes<WriteST,    [M7UnitStore]>;
}

// Branches: every branch write has Latency 2 on M7UnitBranch.
foreach BrWrite = [WriteBr, WriteBrL, WriteBrTbl] in
  def : WriteRes<BrWrite, [M7UnitBranch]> {
    let Latency = 2;
  }

// Noop: no resources, no latency.
def : WriteRes<WriteNoop, []> {
  let Latency = 0;
}
105
106//===---------------------------------------------------------------------===//
107// Sched definitions for floating-point instructions
108//
// Floating-point conversions and moves.  M7WriteFPMOV64 occupies both
// VPort halves.
let Latency = 3 in {
  def : WriteRes<WriteFPCVT, [M7UnitVFP, M7UnitVPort]>;
  def : WriteRes<WriteFPMOV, [M7UnitVPort]>;
  def M7WriteFPMOV64 : SchedWriteRes<[M7UnitVPortL, M7UnitVPortH]>;
}

// Single-precision operations.  These go down the FP pipeline, which has a
// latency of 3 cycles, and take the VFP unit plus one VPort.
let Latency = 3 in {
  def : WriteRes<WriteFPALU32, [M7UnitVFP, M7UnitVPort]>;
  def : WriteRes<WriteFPMUL32, [M7UnitVFP, M7UnitVPort]>;
}
// Multiply-accumulate.  FPMAC goes down the FP pipeline.
def : WriteRes<WriteFPMAC32, [M7UnitVFP, M7UnitVPort]> {
  let Latency = 6;
}
// Division and square-root.  Effective scheduling latency is 3, though
// the real latency is larger.
let Latency = 16 in {
  def : WriteRes<WriteFPDIV32,  [M7UnitVFP, M7UnitVPort]>;
  def : WriteRes<WriteFPSQRT32, [M7UnitVFP, M7UnitVPort]>;
}

// Double-precision operations.  Each takes the VFP unit and both VPort
// halves, and is marked BeginGroup.
let BeginGroup = 1 in {
  def : WriteRes<WriteFPALU64, [M7UnitVFP, M7UnitVPortL, M7UnitVPortH]> {
    let Latency = 4;
  }
  def : WriteRes<WriteFPMUL64, [M7UnitVFP, M7UnitVPortL, M7UnitVPortH]> {
    let Latency = 7;
  }
  def : WriteRes<WriteFPMAC64, [M7UnitVFP, M7UnitVPortL, M7UnitVPortH]> {
    let Latency = 11;
  }
  // Division and square-root.  Effective scheduling latency is 3, though
  // the real latency is larger.
  let Latency = 30 in {
    def : WriteRes<WriteFPDIV64,  [M7UnitVFP, M7UnitVPortL, M7UnitVPortH]>;
    def : WriteRes<WriteFPSQRT64, [M7UnitVFP, M7UnitVPortL, M7UnitVPortH]>;
  }
}
151
// ALU write that occupies M7UnitShift2 instead of M7UnitShift1.
def M7WriteShift2 : SchedWriteRes<[M7UnitALU, M7UnitShift2]>;

// Not used for M7, but definitions are needed anyway.
foreach UnusedWrite = [WriteVLD1, WriteVLD2, WriteVLD3, WriteVLD4,
                       WriteVST1, WriteVST2, WriteVST3, WriteVST4] in
  def : WriteRes<UnusedWrite, []>;

// Resource-free, zero-micro-op writes that only carry an issue constraint.
let NumMicroOps = 0 in {
  // Sets SingleIssue.
  def M7SingleIssue : SchedWriteRes<[]> {
    let SingleIssue = 1;
  }
  // Sets BeginGroup.
  def M7Slot0Only : SchedWriteRes<[]> {
    let BeginGroup = 1;
  }
}
172
// Read advances: the pipeline stage at which each kind of source operand
// must be ready.
foreach ZeroAdvRead = [ReadALUsr, ReadMUL, ReadALU, ReadFPMUL] in
  def : ReadAdvance<ZeroAdvRead, 0>;
def : ReadAdvance<ReadMAC,   1>;
def : ReadAdvance<ReadFPMAC, 3>;

// Named read advances used by the InstRW lists in this file.
def M7Read_ISS : SchedReadAdvance<-1>;  // operands needed at EX1
def M7Read_EX2 : SchedReadAdvance<1>;   // operands needed at EX3
def M7Read_EX3 : SchedReadAdvance<2>;   // operands needed at EX4
184
// Non-general-purpose instructions may not be dual-issued; they use both
// issue units.
def M7NonGeneralPurpose : SchedWriteRes<[]> {
  let SingleIssue = 1;
  // Assume that these go down the main ALU pipeline.  In reality, many
  // look likely to stall the whole pipeline.
  let Latency = 3;
}

// The non-general-purpose instructions.
def : InstRW<[M7NonGeneralPurpose],
             (instregex "t2MRS", "tSVC", "tBKPT", "t2MSR",
                        "t2DMB", "t2DSB", "t2ISB",
                        "t2HVC", "t2SMC", "t2UDF", "ERET",
                        "tHINT", "t2HINT", "t2CLREX",
                        "BUNDLE")>;
199
200//===---------------------------------------------------------------------===//
201// Sched definitions for load/store
202//
203// Mark whether the loads/stores must be single-issue
204// Address operands are needed earlier
205// Data operands are needed later
206
// Resource-free, zero-micro-op writes that only supply a latency.
let NumMicroOps = 0 in {
  // Base-register update: bypassable out of EX1.
  def M7BaseUpdate : SchedWriteRes<[]> {
    let Latency = 0;
  }
  def M7LoadLatency1 : SchedWriteRes<[]> {
    let Latency = 1;
  }
}

// Load with Latency 2 on the load unit group.
def M7SlowLoad : SchedWriteRes<[M7UnitLoad]> {
  let Latency = 2;
}
216
// Byte and half-word loads get M7SlowLoad, giving them greater latency than
// other loads.  The same applies to byte/half-word load-exclusive.

// No early-read operand.
def : InstRW<[M7SlowLoad],
             (instregex "t2LDR(B|H|SB|SH)pc")>;
// One operand read early (M7Read_ISS).
def : InstRW<[M7SlowLoad, M7Read_ISS],
             (instregex "t2LDR(B|H|SB|SH)T",
                        "t2LDR(B|H|SB|SH)i",
                        "tLDR(B|H)i")>;
// Two operands read early.
def : InstRW<[M7SlowLoad, M7Read_ISS, M7Read_ISS],
             (instregex "t2LDR(B|H|SB|SH)s",
                        "tLDR(B|H)r",
                        "tLDR(SB|SH)")>;
// Second write is the base update.
def : InstRW<[M7SlowLoad, M7BaseUpdate, M7Read_ISS],
             (instregex "t2LDR(B|H|SB|SH)_(POST|PRE)")>;

// Exclusive loads/stores cannot be dual-issued.
def : InstRW<[WriteLd, M7Slot0Only, M7Read_ISS],
             (instregex "t2LDREX$")>;
def : InstRW<[M7SlowLoad, M7Slot0Only, M7Read_ISS],
             (instregex "t2LDREX(B|H)")>;
def : InstRW<[WriteST, M7SingleIssue, M7Read_EX2, M7Read_ISS],
             (instregex "t2STREX(B|H)?$")>;
237
// Load/store multiples cannot be dual-issued.  Default scheduling happens
// around the read/write times of the individual registers in the list.  The
// read time for STM cannot be overridden, because it is a variadic source
// operand.

// Without base update.
def : InstRW<[WriteLd, M7SingleIssue, M7Read_ISS],
             (instregex "(t|t2)LDM(DB|IA)$")>;
def : InstRW<[WriteST, M7SingleIssue, M7Read_ISS],
             (instregex "(t|t2)STM(DB|IA)$")>;

// With base update (M7BaseUpdate is the first write).
def : InstRW<[M7BaseUpdate, WriteLd, M7SingleIssue, M7Read_ISS],
             (instregex "(t|t2)LDM(DB|IA)_UPD$",
                        "tPOP")>;
def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue, M7Read_ISS],
             (instregex "(t|t2)STM(DB|IA)_UPD$",
                        "tPUSH")>;
250
// Load/store doubles cannot be dual-issued (M7SingleIssue).

// Stores: two operands use M7Read_EX2, the last uses M7Read_ISS.
def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue, M7Read_EX2, M7Read_EX2,
              M7Read_ISS],
             (instregex "t2STRD_(PRE|POST)")>;
def : InstRW<[WriteST, M7SingleIssue, M7Read_EX2, M7Read_EX2, M7Read_ISS],
             (instregex "t2STRDi")>;

// Loads: the first two writes are WriteLd and M7LoadLatency1.
def : InstRW<[WriteLd, M7LoadLatency1, M7SingleIssue, M7BaseUpdate,
              M7Read_ISS],
             (instregex "t2LDRD_(PRE|POST)")>;
def : InstRW<[WriteLd, M7LoadLatency1, M7SingleIssue, M7Read_ISS],
             (instregex "t2LDRDi")>;
262
// Word load / preload.  All use WriteLd; operands listed as M7Read_ISS are
// read early.

// No early-read operand.
def : InstRW<[WriteLd],
      (instregex "t2LDRpc", "t2PL[DI]pci", "tLDRpci")>;
// One early-read operand.  "(W)?" also covers the PLDW forms.
def : InstRW<[WriteLd, M7Read_ISS],
      (instregex "t2LDR(i|T)", "t2PL[DI](W)?i", "tLDRi", "tLDRspi")>;
// Two early-read operands.  The optional "W" must be upper-case, as in the
// pattern above: instregex is case-sensitive, so a lower-case "(w)?" could
// never match a PLDW register-offset instruction name.
def : InstRW<[WriteLd, M7Read_ISS, M7Read_ISS],
      (instregex "t2LDRs", "t2PL[DI](W)?s", "tLDRr")>;
// Second write is the base update.
def : InstRW<[WriteLd, M7BaseUpdate, M7Read_ISS],
      (instregex "t2LDR_(POST|PRE)")>;
272
// Stores.  The first read uses M7Read_EX2; the remaining reads use
// M7Read_ISS.

// With base update.
def : InstRW<[M7BaseUpdate, WriteST, M7Read_EX2, M7Read_ISS],
             (instregex "t2STR(B|H)?_(POST|PRE)")>;
// Two M7Read_ISS operands.
def : InstRW<[WriteST, M7Read_EX2, M7Read_ISS, M7Read_ISS],
             (instregex "t2STR(B|H)?s$",
                        "tSTR(B|H)?r$")>;
// One M7Read_ISS operand.
def : InstRW<[WriteST, M7Read_EX2, M7Read_ISS],
             (instregex "t2STR(B|H)?(i|T)",
                        "tSTR(B|H)?i$",
                        "tSTRspi")>;
280
// TBB/TBH: single-issue only, and modelled as two micro-ops because they
// take two cycles to issue.
def M7TableLoad : SchedWriteRes<[M7UnitLoad]> {
  let SingleIssue = 1;
  let NumMicroOps = 2;
}

def : InstRW<[M7TableLoad, M7Read_ISS, M7Read_ISS],
             (instregex "t2TB")>;
289
// VFP loads and stores.

// Single-precision forms go through the load/VPort groups (or the store
// unit plus the VPort group).
def M7LoadSP : SchedWriteRes<[M7UnitLoad, M7UnitVPort]> {
  let Latency = 1;
}
def M7StoreSP : SchedWriteRes<[M7UnitStore, M7UnitVPort]>;

// Double-precision forms name both halves of each resource pair explicitly
// and are single-issue.
let SingleIssue = 1 in {
  def M7LoadDP : SchedWriteRes<[M7UnitLoadL, M7UnitLoadH,
                                M7UnitVPortL, M7UnitVPortH]> {
    let Latency = 2;
  }
  def M7StoreDP : SchedWriteRes<[M7UnitStore, M7UnitVPortL, M7UnitVPortH]>;
}

def : InstRW<[M7LoadSP, M7Read_ISS],
             (instregex "VLDR(S|H)$")>;
def : InstRW<[M7LoadDP, M7Read_ISS],
             (instregex "VLDRD$")>;
def : InstRW<[M7StoreSP, M7Read_EX3, M7Read_ISS],
             (instregex "VSTR(S|H)$")>;
def : InstRW<[M7StoreDP, M7Read_EX3, M7Read_ISS],
             (instregex "VSTRD$")>;
306
// VFP load/store multiples cannot be dual-issued (M7SingleIssue).

// Without base update.
def : InstRW<[WriteLd, M7SingleIssue, M7Read_ISS],
             (instregex "VLDM(S|D|Q)(DB|IA)$")>;
def : InstRW<[WriteST, M7SingleIssue, M7Read_ISS],
             (instregex "VSTM(S|D|Q)(DB|IA)$")>;

// With base update (M7BaseUpdate is the first write).
def : InstRW<[M7BaseUpdate, WriteLd, M7SingleIssue, M7Read_ISS],
             (instregex "VLDM(S|D|Q)(DB|IA)_UPD$")>;
def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue, M7Read_ISS],
             (instregex "VSTM(S|D|Q)(DB|IA)_UPD$")>;
317
318//===---------------------------------------------------------------------===//
319// Sched definitions for ALU
320//
321
// Shifted ALU operands are read a cycle early.  This read's -1 advance is
// listed against the WriteLd and M7LoadLatency1 writes.
def M7Ex1ReadNoFastBypass : SchedReadAdvance<-1, [WriteLd, M7LoadLatency1]>;

def : InstRW<
    [WriteALUsi, M7Ex1ReadNoFastBypass, M7Read_ISS],
    (instregex "t2(ADC|ADDS|ADD|BIC|EOR|ORN|ORR|RSBS|RSB|SBC|SUBS)rs$",
               "t2(SUB|CMP|CMNz|TEQ|TST)rs$",
               "t2MOVsr(a|l)")>;
def : InstRW<[WriteALUsi, M7Read_ISS], (instregex "t2MVNs")>;

// Pure shift operations (except RRX) are treated as if they used the EX1
// shifter, but are given the timing of the EX2 shifter, since they can
// usually choose the EX2 shifter when needed.  This misses a few dual-issue
// cases, but the results prove to be better than trying to model them
// exactly.
def : InstRW<[M7WriteShift2, M7Read_ISS],
             (instregex "t2RRX$")>;
def : InstRW<[WriteALUsi],
             (instregex "(t|t2)(LSL|LSR|ASR|ROR)")>;

// Instructions that use the shifter, but have normal timing.
def : InstRW<[WriteALUsi, M7Slot0Only],
             (instregex "t2(BFC|BFI)$")>;

// Instructions which are slot-zero only but otherwise normal.
def : InstRW<[WriteALU, M7Slot0Only],
             (instregex "t2CLZ")>;

// MAC operations that don't have SchedRW set.
def : InstRW<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC],
             (instregex "t2SML[AS]D")>;

// Divides are special: they stall for their latency, so they look like a
// single-cycle operation as far as scheduling opportunities go.  Putting
// WriteALU first makes the operand latency 1 while keeping the instruction
// latency at 7.
def : InstRW<[WriteALU, WriteDIV],
             (instregex "t2(S|U)DIV")>;
357
// DSP extension operations.  Every DSP write is marked BeginGroup and
// occupies M7UnitSIMD together with M7UnitALU; the "Sh" variants also
// occupy M7UnitShift1.  The numeric suffix is the latency.
let BeginGroup = 1 in {
  def M7WriteSIMD1 : SchedWriteRes<[M7UnitSIMD, M7UnitALU]> {
    let Latency = 1;
  }
  def M7WriteSIMD2 : SchedWriteRes<[M7UnitSIMD, M7UnitALU]> {
    let Latency = 2;
  }
  def M7WriteShSIMD1 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]> {
    let Latency = 1;
  }
  // Bypassable out of EX1.
  def M7WriteShSIMD0 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]> {
    let Latency = 0;
  }
  def M7WriteShSIMD2 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]> {
    let Latency = 2;
  }
}

def : InstRW<[M7WriteShSIMD2, M7Read_ISS], (instregex "t2(S|U)SAT")>;
def : InstRW<[M7WriteSIMD1, ReadALU], (instregex "(t|t2)(S|U)XT(B|H)")>;
def : InstRW<
    [M7WriteSIMD1, ReadALU, ReadALU],
    (instregex "t2(S|SH|U|UH)(ADD16|ADD8|ASX|SAX|SUB16|SUB8)", "t2SEL")>;
def : InstRW<
    [M7WriteSIMD2, ReadALU, ReadALU],
    (instregex "t2(Q|UQ)(ADD|ASX|SAX|SUB)", "t2USAD8")>;
def : InstRW<[M7WriteShSIMD2, M7Read_ISS, M7Read_ISS],
             (instregex "t2QD(ADD|SUB)")>;
def : InstRW<[M7WriteShSIMD0, M7Read_ISS], (instregex "t2(RBIT|REV)", "tREV")>;
def : InstRW<[M7WriteShSIMD1, M7Read_ISS], (instregex "t2(SBFX|UBFX)")>;
def : InstRW<
    [M7WriteShSIMD1, ReadALU, M7Read_ISS],
    (instregex "t2PKH(BT|TB)", "t2(S|U)XTA")>;
def : InstRW<[M7WriteSIMD2, ReadALU, ReadALU, M7Read_EX2],
             (instregex "t2USADA8")>;

// MSR/MRS are scheduled as non-general-purpose instructions.
def : InstRW<[M7NonGeneralPurpose],
             (instregex "MSR", "MRS")>;
403
404//===---------------------------------------------------------------------===//
405// Sched definitions for FP operations
406//
407
// The effective scheduling latency is really 3 for nearly all FP
// operations, even when their true latency is higher.  These zero-micro-op
// writes are placed in InstRW lists alongside the real write.
let Latency = 3, NumMicroOps = 0 in {
  // No resources.
  def M7WriteVFPLatOverride : SchedWriteRes<[]>;
  // Occupies the M7UnitVPort group.
  def M7WriteVFPExtraVPort : SchedWriteRes<[M7UnitVPort]>;
}

// Instructions which are missing default schedules.
def : InstRW<
    [WriteFPALU32],
    (instregex "V(ABS|CVT.*|NEG|FP_VMAX.*|FP_VMIN.*|RINT.*)S$")>;
def : InstRW<
    [M7WriteVFPLatOverride, WriteFPALU64],
    (instregex "V(ABS|CVT.*|NEG|FP_VMAX.*|FP_VMIN.*|RINT.*)D$")>;
424
// VCMP: both precisions are given Latency 0.
def M7WriteVCMPS : SchedWriteRes<[M7UnitVFP, M7UnitVPort]> {
  let Latency = 0;
}
// NOTE(review): the other double-precision writes in this file name
// M7UnitVPortL and M7UnitVPortH explicitly, whereas this one lists the
// M7UnitVPort group twice -- confirm that this is intended.
def M7WriteVCMPD : SchedWriteRes<[M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
  let BeginGroup = 1;
  let Latency = 0;
}
def : InstRW<[M7WriteVCMPS],
             (instregex "VCMPS$")>;
def : InstRW<[M7WriteVCMPD],
             (instregex "VCMPD$")>;

// VMRS/VMSR: single-issue.
let SingleIssue = 1 in {
  def M7VMRS : SchedWriteRes<[M7UnitVFP, M7UnitVPort]>;
  def M7VMSR : SchedWriteRes<[M7UnitVFP, M7UnitVPort]>;
}
def : InstRW<[M7VMRS],
             (instregex "FMSTAT")>;
def : InstRW<[M7VMSR],
             (instregex "VMSR")>;
439
// VSEL cannot bypass in its implied $cpsr operand, so that operand is
// modelled as an earlier read (M7Read_ISS).
def : InstRW<
    [WriteFPALU32, M7Slot0Only, ReadALU, ReadALU, M7Read_ISS],
    (instregex "VSEL.*S$")>;
def : InstRW<
    [M7WriteVFPLatOverride, WriteFPALU64, M7Slot0Only, ReadALU, ReadALU,
     M7Read_ISS],
    (instregex "VSEL.*D$")>;

// VMOV and FP constants.
def : InstRW<[WriteFPMOV], (instregex "VMOV(H|S)$", "FCONST(H|S)")>;
// The double-precision forms add M7WriteVFPExtraVPort and are slot-zero only.
def : InstRW<
    [WriteFPMOV, M7WriteVFPExtraVPort, M7Slot0Only],
    (instregex "VMOVD$", "FCONSTD")>;
// These VMOV forms are single-issue.
def : InstRW<
    [WriteFPMOV, M7WriteVFPExtraVPort, M7SingleIssue],
    (instregex "VMOV(DRR|RRD|RRS|SRR)")>;
456
// Larger-latency overrides.  M7WriteVFPLatOverride comes first in each
// list, ahead of the real write.
def : InstRW<[M7WriteVFPLatOverride, WriteFPDIV32],
             (instregex "VDIVS")>;
def : InstRW<[M7WriteVFPLatOverride, WriteFPDIV64],
             (instregex "VDIVD")>;
def : InstRW<[M7WriteVFPLatOverride, WriteFPSQRT32],
             (instregex "VSQRTS")>;
def : InstRW<[M7WriteVFPLatOverride, WriteFPSQRT64],
             (instregex "VSQRTD")>;
def : InstRW<[M7WriteVFPLatOverride, WriteFPMUL64],
             (instregex "V(MUL|NMUL)D")>;
def : InstRW<[M7WriteVFPLatOverride, WriteFPALU64],
             (instregex "V(ADD|SUB)D")>;

// Multiply-accumulate.  Chained single-precision timing is already correct;
// the rest need overrides.

// A double-precision chained MAC stalls the pipeline behind it for 3
// cycles, making it appear to have a 3-cycle latency for scheduling.
def : InstRW<
    [M7WriteVFPLatOverride, WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL],
    (instregex "V(N)?ML(A|S)D$")>;

// Single-precision fused MACs look like latency 5 with an advance of 2.
def M7ReadFPMAC2 : SchedReadAdvance<2>;
def M7WriteVFPLatOverride5 : SchedWriteRes<[]> {
  let NumMicroOps = 0;
  let Latency = 5;
}

def : InstRW<
    [M7WriteVFPLatOverride5, WriteFPMAC32, M7ReadFPMAC2, ReadFPMUL, ReadFPMUL],
    (instregex "VF(N)?M(A|S)S$")>;

// A double-precision fused MAC stalls the pipeline behind it for 2 cycles,
// making it appear to have a 3-cycle latency for scheduling.
def : InstRW<
    [M7WriteVFPLatOverride, WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL],
    (instregex "VF(N)?M(A|S)D$")>;
494
495}  // SchedModel = CortexM7Model
496