1//=- AArch64SchedAmpere1B.td - Ampere-1B scheduling def -----*- tablegen -*-=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the machine model for the Ampere Computing Ampere-1B to
10// support instruction scheduling and other instruction cost heuristics.
11//
12//===----------------------------------------------------------------------===//
13
14// The Ampere-1B core is an out-of-order micro-architecture.  The front
15// end has branch prediction, with a 10-cycle recovery time from a
16// mispredicted branch.  Instructions coming out of the front end are
17// decoded into internal micro-ops (uops).
18
// Machine-model record for Ampere-1B.  Each field below overrides the
// corresponding SchedMachineModel setting.
def Ampere1BModel : SchedMachineModel {
  // Maximum micro-ops dispatch rate.
  let IssueWidth = 12;
  // Micro-op re-order buffer size.
  let MicroOpBufferSize = 192;
  // Optimistic load latency.
  let LoadLatency = 3;
  // Branch mispredict penalty.
  let MispredictPenalty = 10;
  // Instruction queue size.
  let LoopMicroOpBufferSize = 32;
  let CompleteModel = 1;

  // Concatenation of the SVE, SME and PA "unsupported" predicate lists.
  list<Predicate> UnsupportedFeatures =
      !listconcat(SVEUnsupported.F, SMEUnsupported.F, PAUnsupported.F);
}
31
32let SchedModel = Ampere1BModel in {
33
34//===----------------------------------------------------------------------===//
35// Define each kind of processor resource and number available on Ampere-1B.
36
// Integer single-cycle, branch, and flags r/w.
def Ampere1BUnitA : ProcResource<2>;
// Integer single-cycle, and complex shifts.
def Ampere1BUnitB : ProcResource<2>;
// Integer multi-cycle.
def Ampere1BUnitBS : ProcResource<1>;
// Load.
def Ampere1BUnitL : ProcResource<2>;
// Store address calculation.
def Ampere1BUnitS : ProcResource<2>;
// FP and vector operations, and flag write.
def Ampere1BUnitX : ProcResource<1>;
// FP and vector operations, and crypto.
def Ampere1BUnitY : ProcResource<1>;
// FP store data and FP-to-integer moves.
def Ampere1BUnitZ : ProcResource<1>;

// Groups: a write that names a group may use any unit in that group.
def Ampere1BUnitAB : ProcResGroup<[Ampere1BUnitA, Ampere1BUnitB]>;
def Ampere1BUnitXY : ProcResGroup<[Ampere1BUnitX, Ampere1BUnitY]>;
48
49//===----------------------------------------------------------------------===//
// Define customized scheduler read/write types specific to the Ampere-1B.
51
// One-cycle writes.  Names in this file follow the pattern
// Ampere1BWrite_<latency>cyc_<count><unit>[_<count><unit>...]; each record
// sets Latency, NumMicroOps and the list of consumed resources.
let Latency = 1, NumMicroOps = 1 in
def Ampere1BWrite_1cyc_1A : SchedWriteRes<[Ampere1BUnitA]>;

let Latency = 1, NumMicroOps = 2 in
def Ampere1BWrite_1cyc_2A : SchedWriteRes<[Ampere1BUnitA, Ampere1BUnitA]>;

let Latency = 1, NumMicroOps = 1 in
def Ampere1BWrite_1cyc_1B : SchedWriteRes<[Ampere1BUnitB]>;

let Latency = 1, NumMicroOps = 1 in
def Ampere1BWrite_1cyc_1BS : SchedWriteRes<[Ampere1BUnitBS]>;

let Latency = 1, NumMicroOps = 2 in
def Ampere1BWrite_1cyc_1BS_1B
    : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitB]>;

let Latency = 1, NumMicroOps = 1 in
def Ampere1BWrite_1cyc_1AB : SchedWriteRes<[Ampere1BUnitAB]>;

let Latency = 1, NumMicroOps = 2 in
def Ampere1BWrite_1cyc_1AB_1A
    : SchedWriteRes<[Ampere1BUnitAB, Ampere1BUnitA]>;

let Latency = 1, NumMicroOps = 1 in
def Ampere1BWrite_1cyc_1L : SchedWriteRes<[Ampere1BUnitL]>;

let Latency = 1, NumMicroOps = 1 in
def Ampere1BWrite_1cyc_1S : SchedWriteRes<[Ampere1BUnitS]>;

let Latency = 1, NumMicroOps = 2 in
def Ampere1BWrite_1cyc_2S : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitS]>;
101
// Two-cycle writes.
let Latency = 2, NumMicroOps = 1 in
def Ampere1BWrite_2cyc_1Y : SchedWriteRes<[Ampere1BUnitY]>;

let Latency = 2, NumMicroOps = 2 in
def Ampere1BWrite_2cyc_2AB : SchedWriteRes<[Ampere1BUnitAB, Ampere1BUnitAB]>;

let Latency = 2, NumMicroOps = 2 in
def Ampere1BWrite_2cyc_1B_1AB
    : SchedWriteRes<[Ampere1BUnitB, Ampere1BUnitAB]>;

let Latency = 2, NumMicroOps = 2 in
def Ampere1BWrite_2cyc_1B_1S : SchedWriteRes<[Ampere1BUnitB, Ampere1BUnitS]>;

let Latency = 2, NumMicroOps = 3 in
def Ampere1BWrite_2cyc_1B_1S_1AB
    : SchedWriteRes<[Ampere1BUnitB, Ampere1BUnitS, Ampere1BUnitAB]>;

let Latency = 2, NumMicroOps = 3 in
def Ampere1BWrite_2cyc_1S_2Z
    : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitZ, Ampere1BUnitZ]>;

let Latency = 2, NumMicroOps = 1 in
def Ampere1BWrite_2cyc_1XY : SchedWriteRes<[Ampere1BUnitXY]>;

let Latency = 2, NumMicroOps = 2 in
def Ampere1BWrite_2cyc_1S_1Z : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitZ]>;
145
// Three-cycle writes.
let Latency = 3, NumMicroOps = 1 in
def Ampere1BWrite_3cyc_1BS : SchedWriteRes<[Ampere1BUnitBS]>;

let Latency = 3, NumMicroOps = 1 in
def Ampere1BWrite_3cyc_1L : SchedWriteRes<[Ampere1BUnitL]>;

let Latency = 3, NumMicroOps = 1 in
def Ampere1BWrite_3cyc_1X : SchedWriteRes<[Ampere1BUnitX]>;

let Latency = 3, NumMicroOps = 1 in
def Ampere1BWrite_3cyc_1XY : SchedWriteRes<[Ampere1BUnitXY]>;

let Latency = 3, NumMicroOps = 1 in
def Ampere1BWrite_3cyc_1Z : SchedWriteRes<[Ampere1BUnitZ]>;

let Latency = 3, NumMicroOps = 2 in
def Ampere1BWrite_3cyc_1S_1Z : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitZ]>;

let Latency = 3, NumMicroOps = 3 in
def Ampere1BWrite_3cyc_1S_2Z
    : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitZ, Ampere1BUnitZ]>;

let Latency = 3, NumMicroOps = 4 in
def Ampere1BWrite_3cyc_2S_2Z
    : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitS,
                     Ampere1BUnitZ, Ampere1BUnitZ]>;
188
// Four-cycle writes (the 5-cycle BS write keeps its original position in the
// sequence).
let Latency = 4, NumMicroOps = 2 in
def Ampere1BWrite_4cyc_1BS_1AB
    : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitAB]>;

let Latency = 4, NumMicroOps = 1 in
def Ampere1BWrite_4cyc_1L : SchedWriteRes<[Ampere1BUnitL]>;

let Latency = 4, NumMicroOps = 2 in
def Ampere1BWrite_4cyc_2L : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitL]>;

let Latency = 4, NumMicroOps = 2 in
def Ampere1BWrite_4cyc_1L_1B : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitB]>;

let Latency = 4, NumMicroOps = 1 in
def Ampere1BWrite_4cyc_1X : SchedWriteRes<[Ampere1BUnitX]>;

let Latency = 4, NumMicroOps = 1 in
def Ampere1BWrite_4cyc_1XY : SchedWriteRes<[Ampere1BUnitXY]>;

let Latency = 4, NumMicroOps = 2 in
def Ampere1BWrite_4cyc_2XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY]>;

let Latency = 5, NumMicroOps = 1 in
def Ampere1BWrite_5cyc_1BS : SchedWriteRes<[Ampere1BUnitBS]>;

let Latency = 4, NumMicroOps = 3 in
def Ampere1BWrite_4cyc_1XY_1S_1Z
    : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitS, Ampere1BUnitZ]>;

let Latency = 4, NumMicroOps = 6 in
def Ampere1BWrite_4cyc_3S_3Z
    : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitS, Ampere1BUnitS,
                     Ampere1BUnitZ, Ampere1BUnitZ, Ampere1BUnitZ]>;
242
// Five-cycle writes.
let Latency = 5, NumMicroOps = 8 in
def Ampere1BWrite_5cyc_4S_4Z
    : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitS, Ampere1BUnitS, Ampere1BUnitS,
                     Ampere1BUnitZ, Ampere1BUnitZ, Ampere1BUnitZ, Ampere1BUnitZ]>;

let Latency = 5, NumMicroOps = 2 in
def Ampere1BWrite_5cyc_1L_1BS
    : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitBS]>;

let Latency = 5, NumMicroOps = 3 in
def Ampere1BWrite_5cyc_3L
    : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitL, Ampere1BUnitL]>;

let Latency = 5, NumMicroOps = 4 in
def Ampere1BWrite_5cyc_4L
    : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitL,
                     Ampere1BUnitL, Ampere1BUnitL]>;

let Latency = 5, NumMicroOps = 1 in
def Ampere1BWrite_5cyc_1X : SchedWriteRes<[Ampere1BUnitX]>;

let Latency = 5, NumMicroOps = 6 in
def Ampere1BWrite_5cyc_2XY_2S_2Z
    : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY,
                     Ampere1BUnitS, Ampere1BUnitS,
                     Ampere1BUnitZ, Ampere1BUnitZ]>;
283
// Six-cycle writes involving the BS or L units.
let Latency = 6, NumMicroOps = 2 in
def Ampere1BWrite_6cyc_1BS_1A
    : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitA]>;

let Latency = 6, NumMicroOps = 3 in
def Ampere1BWrite_6cyc_1BS_2A
    : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitA, Ampere1BUnitA]>;

let Latency = 6, NumMicroOps = 2 in
def Ampere1BWrite_6cyc_1L_1XY
    : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitXY]>;

let Latency = 6, NumMicroOps = 4 in
def Ampere1BWrite_6cyc_2L_2XY
    : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitL,
                     Ampere1BUnitXY, Ampere1BUnitXY]>;
305
// Six-cycle write that issues one micro-op on unit X.
// NumMicroOps is 1 so that it agrees with the single listed resource and the
// "_1X" suffix, as for every other "_1X" write in this file.
def Ampere1BWrite_6cyc_1X : SchedWriteRes<[Ampere1BUnitX]> {
  let Latency = 6;
  let NumMicroOps = 1;
}
310
// Six-cycle writes on the XY group, optionally paired with store (S/Z) uops.
let Latency = 6, NumMicroOps = 2 in
def Ampere1BWrite_6cyc_2XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY]>;

let Latency = 6, NumMicroOps = 3 in
def Ampere1BWrite_6cyc_3XY
    : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY]>;

let Latency = 6, NumMicroOps = 6 in
def Ampere1BWrite_6cyc_2XY_2S_2Z
    : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY,
                     Ampere1BUnitS, Ampere1BUnitS,
                     Ampere1BUnitZ, Ampere1BUnitZ]>;

let Latency = 6, NumMicroOps = 9 in
def Ampere1BWrite_6cyc_3XY_3S_3Z
    : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY,
                     Ampere1BUnitS, Ampere1BUnitS, Ampere1BUnitS,
                     Ampere1BUnitZ, Ampere1BUnitZ, Ampere1BUnitZ]>;
335
// Seven-cycle writes.
let Latency = 7, NumMicroOps = 2 in
def Ampere1BWrite_7cyc_1BS_1XY
    : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitXY]>;

let Latency = 7, NumMicroOps = 2 in
def Ampere1BWrite_7cyc_1XY_1Z
    : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitZ]>;

let Latency = 7, NumMicroOps = 2 in
def Ampere1BWrite_7cyc_1X_1Z : SchedWriteRes<[Ampere1BUnitX, Ampere1BUnitZ]>;

let Latency = 7, NumMicroOps = 6 in
def Ampere1BWrite_7cyc_3L_3XY
    : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitL, Ampere1BUnitL,
                     Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY]>;

let Latency = 7, NumMicroOps = 8 in
def Ampere1BWrite_7cyc_4L_4XY
    : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitL,
                     Ampere1BUnitL, Ampere1BUnitL,
                     Ampere1BUnitXY, Ampere1BUnitXY,
                     Ampere1BUnitXY, Ampere1BUnitXY]>;

let Latency = 7, NumMicroOps = 12 in
def Ampere1BWrite_7cyc_4XY_4S_4Z
    : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY,
                     Ampere1BUnitXY, Ampere1BUnitXY,
                     Ampere1BUnitS, Ampere1BUnitS,
                     Ampere1BUnitS, Ampere1BUnitS,
                     Ampere1BUnitZ, Ampere1BUnitZ,
                     Ampere1BUnitZ, Ampere1BUnitZ]>;
375
// Eight-cycle writes.
let Latency = 8, NumMicroOps = 2 in
def Ampere1BWrite_8cyc_1BS_1L
    : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitL]>;

let Latency = 8, NumMicroOps = 2 in
def Ampere1BWrite_8cyc_1BS_1XY
    : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitXY]>;

let Latency = 8, NumMicroOps = 5 in
def Ampere1BWrite_8cyc_2L_3XY
    : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitL,
                     Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY]>;

let Latency = 8, NumMicroOps = 6 in
def Ampere1BWrite_8cyc_3L_3XY
    : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitL, Ampere1BUnitL,
                     Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY]>;

let Latency = 8, NumMicroOps = 8 in
def Ampere1BWrite_8cyc_4L_4XY
    : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitL,
                     Ampere1BUnitL, Ampere1BUnitL,
                     Ampere1BUnitXY, Ampere1BUnitXY,
                     Ampere1BUnitXY, Ampere1BUnitXY]>;

let Latency = 8, NumMicroOps = 2 in
def Ampere1BWrite_8cyc_2XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY]>;

let Latency = 8, NumMicroOps = 4 in
def Ampere1BWrite_8cyc_4XY
    : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY,
                     Ampere1BUnitXY, Ampere1BUnitXY]>;
418
// Nine-cycle writes.
let Latency = 9, NumMicroOps = 14 in
def Ampere1BWrite_9cyc_6XY_4S_4Z
    : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY,
                     Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY,
                     Ampere1BUnitS, Ampere1BUnitS,
                     Ampere1BUnitS, Ampere1BUnitS,
                     Ampere1BUnitZ, Ampere1BUnitZ,
                     Ampere1BUnitZ, Ampere1BUnitZ]>;

let Latency = 9, NumMicroOps = 3 in
def Ampere1BWrite_9cyc_1A_1BS_1X
    : SchedWriteRes<[Ampere1BUnitA, Ampere1BUnitBS, Ampere1BUnitX]>;

let Latency = 9, NumMicroOps = 3 in
def Ampere1BWrite_9cyc_1A_1BS_1XY
    : SchedWriteRes<[Ampere1BUnitA, Ampere1BUnitBS, Ampere1BUnitXY]>;

let Latency = 9, NumMicroOps = 6 in
def Ampere1BWrite_9cyc_3L_3XY
    : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitL, Ampere1BUnitL,
                     Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY]>;

let Latency = 9, NumMicroOps = 1 in
def Ampere1BWrite_9cyc_1X : SchedWriteRes<[Ampere1BUnitX]>;

let Latency = 9, NumMicroOps = 3 in
def Ampere1BWrite_9cyc_3XY
    : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY]>;
456
// Ten-cycle write: 4 load uops plus 8 XY uops.
// The resource list names the XY group eight times so that it agrees with
// the "_8XY" suffix and with NumMicroOps = 12 (4 L + 8 XY).
def Ampere1BWrite_10cyc_4L_8XY : SchedWriteRes<[Ampere1BUnitL,  Ampere1BUnitL,
                                                Ampere1BUnitL,  Ampere1BUnitL,
                                                Ampere1BUnitXY, Ampere1BUnitXY,
                                                Ampere1BUnitXY, Ampere1BUnitXY,
                                                Ampere1BUnitXY, Ampere1BUnitXY,
                                                Ampere1BUnitXY, Ampere1BUnitXY]> {
  let Latency = 10;
  let NumMicroOps = 12;
}

// Eleven-cycle write: 1 BS uop plus 2 XY uops.
def Ampere1BWrite_11cyc_1BS_2XY : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitXY, Ampere1BUnitXY]> {
  let Latency = 11;
  let NumMicroOps = 3;
}

// Eleven-cycle write: 4 load uops plus 8 XY uops.
// As above, the XY group is listed eight times to match the "_8XY" suffix and
// NumMicroOps = 12.
def Ampere1BWrite_11cyc_4L_8XY : SchedWriteRes<[Ampere1BUnitL,  Ampere1BUnitL,
                                                Ampere1BUnitL,  Ampere1BUnitL,
                                                Ampere1BUnitXY, Ampere1BUnitXY,
                                                Ampere1BUnitXY, Ampere1BUnitXY,
                                                Ampere1BUnitXY, Ampere1BUnitXY,
                                                Ampere1BUnitXY, Ampere1BUnitXY]> {
  let Latency = 11;
  let NumMicroOps = 12;
}
477
// Twelve-, thirteen- and seventeen-cycle writes.
let Latency = 12, NumMicroOps = 1 in
def Ampere1BWrite_12cyc_1X : SchedWriteRes<[Ampere1BUnitX]>;

let Latency = 13, NumMicroOps = 2 in
def Ampere1BWrite_13cyc_1BS_1X
    : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitX]>;

let Latency = 17, NumMicroOps = 1 in
def Ampere1BWrite_17cyc_1X : SchedWriteRes<[Ampere1BUnitX]>;
492
// Nineteen-cycle write: 2 BS uops plus 1 X uop.
// Latency is 19 to agree with the "19cyc" name; the previous value of 13
// matched Ampere1BWrite_13cyc_1BS_1X instead.
def Ampere1BWrite_19cyc_2BS_1X : SchedWriteRes<[Ampere1BUnitBS,
                                                Ampere1BUnitBS,
                                                Ampere1BUnitX]> {
  let Latency = 19;
  let NumMicroOps = 3;
}
499
// Long-latency single-uop writes on unit X.
let Latency = 19, NumMicroOps = 1 in
def Ampere1BWrite_19cyc_1X : SchedWriteRes<[Ampere1BUnitX]>;

let Latency = 21, NumMicroOps = 1 in
def Ampere1BWrite_21cyc_1X : SchedWriteRes<[Ampere1BUnitX]>;

let Latency = 33, NumMicroOps = 1 in
def Ampere1BWrite_33cyc_1X : SchedWriteRes<[Ampere1BUnitX]>;

let Latency = 39, NumMicroOps = 1 in
def Ampere1BWrite_39cyc_1X : SchedWriteRes<[Ampere1BUnitX]>;

let Latency = 63, NumMicroOps = 1 in
def Ampere1BWrite_63cyc_1X : SchedWriteRes<[Ampere1BUnitX]>;
524
// Basic arithmetic is modelled with three cases:
//   - extended-register forms: two uops, each free to use unit A or B;
//   - short shifts (LSL shift <= 4): a single uop on A or B;
//   - everything else: one uop on B plus one on A or B.
def Ampere1BWrite_Arith : SchedWriteVariant<[
  SchedVar<RegExtendedPred, [Ampere1BWrite_2cyc_2AB]>,
  SchedVar<IsCheapLSL, [Ampere1BWrite_1cyc_1AB]>,
  SchedVar<NoSchedPred, [Ampere1BWrite_2cyc_1B_1AB]>
]>;

// Flag-setting arithmetic currently selects the same three writes as
// Ampere1BWrite_Arith.
def Ampere1BWrite_ArithFlagsetting : SchedWriteVariant<[
  SchedVar<RegExtendedPred, [Ampere1BWrite_2cyc_2AB]>,
  SchedVar<IsCheapLSL, [Ampere1BWrite_1cyc_1AB]>,
  SchedVar<NoSchedPred, [Ampere1BWrite_2cyc_1B_1AB]>
]>;
537
538//===----------------------------------------------------------------------===//
// Map the target-defined scheduler read/write resources and latencies for Ampere-1B.
540// This provides a coarse model, which is then specialised below.
541
// Integer writes.  Records without an explicit Latency/NumMicroOps keep the
// WriteRes defaults.

// MOVN, MOVZ
def : WriteRes<WriteImm, [Ampere1BUnitAB]>;
// ALU
def : WriteRes<WriteI, [Ampere1BUnitAB]>;
// ALU of Shifted-Reg
let Latency = 2, NumMicroOps = 2 in
def : WriteRes<WriteISReg, [Ampere1BUnitB, Ampere1BUnitAB]>;
// ALU of Extended-Reg
let Latency = 2, NumMicroOps = 2 in
def : WriteRes<WriteIEReg, [Ampere1BUnitAB, Ampere1BUnitAB]>;
// EXTR shifts a reg pair
def : WriteRes<WriteExtr, [Ampere1BUnitB]>;
// Shift/Scale
def : WriteRes<WriteIS, [Ampere1BUnitB]>;
// 32-bit Divide
let Latency = 13 in
def : WriteRes<WriteID32, [Ampere1BUnitBS, Ampere1BUnitX]>;
// 64-bit Divide
let Latency = 19 in
def : WriteRes<WriteID64, [Ampere1BUnitBS, Ampere1BUnitX]>;
// 32-bit Multiply
let Latency = 3 in
def : WriteRes<WriteIM32, [Ampere1BUnitBS]>;
// 64-bit Multiply
let Latency = 3 in
def : WriteRes<WriteIM64, [Ampere1BUnitBS, Ampere1BUnitAB]>;
// Branch, load, store and address-generation writes.
def : WriteRes<WriteBr, [Ampere1BUnitA]>;
def : WriteRes<WriteBrReg, [Ampere1BUnitA, Ampere1BUnitA]>;
// Load from base addr plus immediate offset
let Latency = 3 in
def : WriteRes<WriteLD, [Ampere1BUnitL]>;
// Store to base addr plus immediate offset
let Latency = 1 in
def : WriteRes<WriteST, [Ampere1BUnitS]>;
// Store a register pair.
// NOTE(review): two S resources are listed with NumMicroOps = 1, whereas
// WriteSTIdx below lists the same resources with NumMicroOps = 2 -- confirm.
let Latency = 1, NumMicroOps = 1 in
def : WriteRes<WriteSTP, [Ampere1BUnitS, Ampere1BUnitS]>;
def : WriteRes<WriteAdr, [Ampere1BUnitAB]>;
// Load from a register index (maybe scaled).
// NOTE(review): this load names UnitS rather than UnitL -- confirm.
let Latency = 3, NumMicroOps = 1 in
def : WriteRes<WriteLDIdx, [Ampere1BUnitAB, Ampere1BUnitS]>;
// Store to a register index (maybe scaled).
let Latency = 1, NumMicroOps = 2 in
def : WriteRes<WriteSTIdx, [Ampere1BUnitS, Ampere1BUnitS]>;
// Floating-point and vector writes.

// General floating-point ops.
let Latency = 2 in
def : WriteRes<WriteF, [Ampere1BUnitXY]>;
// Floating-point compare.
let Latency = 3 in
def : WriteRes<WriteFCmp, [Ampere1BUnitX]>;
// Float conversion.
let Latency = 3 in
def : WriteRes<WriteFCvt, [Ampere1BUnitXY]>;
// Float-int register copy (no overrides, so the WriteRes defaults apply).
def : WriteRes<WriteFCopy, [Ampere1BUnitXY]>;
// Floating-point immediate.
let Latency = 2 in
def : WriteRes<WriteFImm, [Ampere1BUnitXY]>;
// Floating-point multiply.
let Latency = 4 in
def : WriteRes<WriteFMul, [Ampere1BUnitXY]>;
// Floating-point division.
let Latency = 19 in
def : WriteRes<WriteFDiv, [Ampere1BUnitXY]>;
// 64bit Vector D ops.
let Latency = 3 in
def : WriteRes<WriteVd, [Ampere1BUnitXY]>;
// 128bit Vector Q ops.
let Latency = 3 in
def : WriteRes<WriteVq, [Ampere1BUnitXY]>;
// Vector loads.
let Latency = 4 in
def : WriteRes<WriteVLD, [Ampere1BUnitL, Ampere1BUnitL]>;
// Vector stores.
let Latency = 2 in
def : WriteRes<WriteVST, [Ampere1BUnitS, Ampere1BUnitZ]>;
619
// Writes that consume no modelled resource.
let Unsupported = 1 in
def : WriteRes<WriteAtomic, []>;

let Latency = 1 in {
  def : WriteRes<WriteSys, []>;
  def : WriteRes<WriteBarrier, []>;
  def : WriteRes<WriteHint, []>;
}

// The second register of a load-pair: LDP,LDPSW,LDNP,LDXP,LDAXP
let Latency = 3 in
def : WriteRes<WriteLDHi, []>;
629
// Forwarding logic.  Only ReadIMA gets a non-zero advance (1 cycle, and only
// from WriteIM32/WriteIM64); every other read listed here is given 0.
// Record order matches the original sequence.
foreach NoFwdRead = [ReadI, ReadISReg, ReadIEReg, ReadIM] in
def : ReadAdvance<NoFwdRead, 0>;

def : ReadAdvance<ReadIMA, 1, [WriteIM32, WriteIM64]>;

foreach NoFwdRead = [ReadID, ReadExtrHi, ReadST, ReadAdrBase, ReadVLD] in
def : ReadAdvance<NoFwdRead, 0>;
641
642//===----------------------------------------------------------------------===//
643// Specialising the scheduling model further for Ampere-1B.
644
// Register copy pseudo.
def : InstRW<[Ampere1BWrite_1cyc_1AB],
             (instrs COPY)>;

// Branch instructions
// -- conditional branch, branch-and-link, return
def : InstRW<[Ampere1BWrite_1cyc_1A],
             (instrs Bcc, BL, RET)>;
// -- compare-and-branch, test-and-branch
def : InstRW<[Ampere1BWrite_1cyc_1A],
             (instrs CBZW, CBZX, CBNZW, CBNZX,
                     TBZW, TBZX, TBNZW, TBNZX)>;
// -- branch-and-link to register
def : InstRW<[Ampere1BWrite_1cyc_2A],
             (instrs BLR)>;
652
// Common Short Sequence Compression (CSSC)
// -- absolute value
def : InstRW<[Ampere1BWrite_1cyc_1AB],
             (instregex "^ABS[WX]")>;
// -- bit count
def : InstRW<[Ampere1BWrite_3cyc_1BS],
             (instregex "^CNT[WX]")>;
// -- count trailing zeros
def : InstRW<[Ampere1BWrite_1cyc_1B],
             (instregex "^CTZ[WX]")>;
// -- signed/unsigned min/max
def : InstRW<[Ampere1BWrite_1cyc_1AB_1A],
             (instregex "^[SU](MAX|MIN)[WX]")>;
658
// Cryptography instructions
// -- AES encryption/decryption
def : InstRW<[Ampere1BWrite_2cyc_1XY],
             (instregex "^AES[DE]")>;
// -- AES mix columns / inverse mix columns
def : InstRW<[Ampere1BWrite_2cyc_1XY],
             (instregex "^AESI?MC")>;
// -- Polynomial multiplication
def : InstRW<[Ampere1BWrite_2cyc_1XY],
             (instregex "^PMUL", "^PMULL")>;
// -- SHA-256 hash
def : InstRW<[Ampere1BWrite_4cyc_1X],
             (instregex "^SHA256(H|H2)")>;
// -- SHA-256 schedule update
def : InstRW<[Ampere1BWrite_2cyc_1Y],
             (instregex "^SHA256SU[01]")>;
// -- SHA-3 instructions
def : InstRW<[Ampere1BWrite_2cyc_1XY],
             (instregex "^BCAX", "^EOR3", "^RAX1", "^XAR")>;
// -- SHA-512 hash
def : InstRW<[Ampere1BWrite_4cyc_1X],
             (instregex "^SHA512(H|H2)")>;
// -- SHA-512 schedule update
def : InstRW<[Ampere1BWrite_2cyc_1Y],
             (instregex "^SHA512SU[01]")>;
// -- SHA1 choose/majority/parity
def : InstRW<[Ampere1BWrite_4cyc_1X],
             (instregex "^SHA1[CMP]")>;
// -- SHA1 hash/schedule update
def : InstRW<[Ampere1BWrite_2cyc_1Y],
             (instregex "^SHA1SU[01]")>;
def : InstRW<[Ampere1BWrite_2cyc_1Y],
             (instregex "^SHA1H")>;
// -- SM3 hash
def : InstRW<[Ampere1BWrite_2cyc_1XY],
             (instregex "^SM3PARTW[12]$", "^SM3SS1$", "^SM3TT[12][AB]$")>;
// -- SM4
def : InstRW<[Ampere1BWrite_4cyc_1X],
             (instrs SM4E, SM4ENCKEY)>;
685
// FP and vector load instructions
// -- LD1R (replicate to all lanes), all arrangements except 1D
def : InstRW<[Ampere1BWrite_6cyc_1L_1XY],
             (instregex "^LD1Rv(8b|4h|2s|16b|8h|4s|2d)")>;
// -- LD1 to a single lane
def : InstRW<[Ampere1BWrite_6cyc_1L_1XY],
             (instregex "^LD1i(8|16|32|64)")>;
// -- LD1R, 1D arrangement
def : InstRW<[Ampere1BWrite_4cyc_1L],
             (instregex "^LD1Rv1d")>;
// -- LD1, multiple structures, 1 register
def : InstRW<[Ampere1BWrite_4cyc_1L],
             (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
// -- LD1, multiple structures, 2 registers
def : InstRW<[Ampere1BWrite_4cyc_2L],
             (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>;
// -- LD1, multiple structures, 3 registers
def : InstRW<[Ampere1BWrite_5cyc_3L],
             (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
// -- LD1, multiple structures, 4 registers
def : InstRW<[Ampere1BWrite_5cyc_4L],
             (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>;
// -- LD2R, 1D arrangement
def : InstRW<[Ampere1BWrite_4cyc_2L],
             (instregex "^LD2Rv1d")>;
// -- LD2R, all other arrangements
def : InstRW<[Ampere1BWrite_6cyc_2L_2XY],
             (instregex "^LD2Rv(8b|4h|2s|16b|8h|4s|2d)")>;
// -- LD2 to a single lane of 2 registers
def : InstRW<[Ampere1BWrite_6cyc_2L_2XY],
             (instregex "^LD2i(8|16|32|64)")>;
// -- LD2, multiple structures, 16B/8H/4S/2D
def : InstRW<[Ampere1BWrite_6cyc_2L_2XY],
             (instregex "^LD2Twov(16b|8h|4s|2d)")>;
// -- LD2, multiple structures, 8B/4H/2S
def : InstRW<[Ampere1BWrite_8cyc_2L_3XY],
             (instregex "^LD2Twov(8b|4h|2s)")>;
// -- LD3R, 1D arrangement
def : InstRW<[Ampere1BWrite_5cyc_3L],
             (instregex "^LD3Rv1d")>;
// -- LD3R, all other arrangements
def : InstRW<[Ampere1BWrite_7cyc_3L_3XY],
             (instregex "^LD3Rv(8b|4h|2s|16b|8h|4s|2d)")>;
// -- LD3 to a single lane of 3 registers
def : InstRW<[Ampere1BWrite_7cyc_3L_3XY],
             (instregex "^LD3i(8|16|32|64)")>;
// -- LD3, multiple structures, 16B/8H/4S
def : InstRW<[Ampere1BWrite_8cyc_3L_3XY],
             (instregex "^LD3Threev(16b|8h|4s)")>;
// -- LD3, multiple structures, 2D
def : InstRW<[Ampere1BWrite_7cyc_3L_3XY],
             (instregex "^LD3Threev2d")>;
// -- LD3, multiple structures, 8B/4H/2S
def : InstRW<[Ampere1BWrite_9cyc_3L_3XY],
             (instregex "^LD3Threev(8b|4h|2s)")>;
// -- LD4R, 1D arrangement
def : InstRW<[Ampere1BWrite_5cyc_4L],
             (instregex "^LD4Rv1d")>;
// -- LD4R, all other arrangements
def : InstRW<[Ampere1BWrite_7cyc_4L_4XY],
             (instregex "^LD4Rv(8b|4h|2s|16b|8h|4s|2d)")>;
// -- LD4 to a single lane of 4 registers
def : InstRW<[Ampere1BWrite_7cyc_4L_4XY],
             (instregex "^LD4i(8|16|32|64)")>;
// -- LD4, multiple structures, 2D
def : InstRW<[Ampere1BWrite_8cyc_4L_4XY],
             (instregex "^LD4Fourv2d")>;
// -- LD4, multiple structures, 2S
def : InstRW<[Ampere1BWrite_11cyc_4L_8XY],
             (instregex "^LD4Fourv2s")>;
// -- LD4, multiple structures, remaining arrangements
def : InstRW<[Ampere1BWrite_10cyc_4L_8XY],
             (instregex "^LD4Fourv(8b|4h|16b|8h|4s)")>;
// NOTE(review): unlike the store patterns in this file, the four patterns
// below carry no leading '^' -- confirm they are still matched from the start
// of the opcode name.
// -- Load pair, Q-form
def : InstRW<[Ampere1BWrite_4cyc_2L],
             (instregex "LDN?PQ")>;
// -- Load pair, S/D-form
// NOTE(review): presumably this also matches LDPSW* opcode names -- confirm
// that is intended.
def : InstRW<[Ampere1BWrite_5cyc_1L_1BS],
             (instregex "LDN?P(S|D)")>;
// -- Load register
// NOTE(review): the matching store pattern accepts both "ui" and "i"
// suffixes, this one only "i" -- confirm.
def : InstRW<[Ampere1BWrite_4cyc_1L],
             (instregex "LDU?R[BHSDQ]i")>;
// -- Load register, sign-extended register
def : InstRW<[Ampere1BWrite_4cyc_1L],
             (instregex "LDR[BHSDQ]ro(W|X)")>;

// FP and vector store instructions
// -- ST1 from a single lane
def : InstRW<[Ampere1BWrite_4cyc_1XY_1S_1Z],
             (instregex "^ST1i(8|16|32|64)")>;
// -- ST1, multiple structures, 1 register
def : InstRW<[Ampere1BWrite_2cyc_1S_1Z],
             (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
// -- ST1, multiple structures, 2 registers
def : InstRW<[Ampere1BWrite_3cyc_2S_2Z],
             (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>;
// -- ST1, multiple structures, 3 registers
def : InstRW<[Ampere1BWrite_4cyc_3S_3Z],
             (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
// -- ST1, multiple structures, 4 registers
def : InstRW<[Ampere1BWrite_5cyc_4S_4Z],
             (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>;
// -- ST2 from a single lane of 2 registers
def : InstRW<[Ampere1BWrite_5cyc_2XY_2S_2Z],
             (instregex "^ST2i(8|16|32|64)")>;
// -- ST2, multiple structures, 16B/8H/4S/2D
def : InstRW<[Ampere1BWrite_5cyc_2XY_2S_2Z],
             (instregex "^ST2Twov(16b|8h|4s|2d)")>;
// -- ST2, multiple structures, 8B/4H/2S
def : InstRW<[Ampere1BWrite_6cyc_2XY_2S_2Z],
             (instregex "^ST2Twov(8b|4h|2s)")>;
// -- ST3 from a single lane of 3 registers
def : InstRW<[Ampere1BWrite_6cyc_3XY_3S_3Z],
             (instregex "^ST3i(8|16|32|64)")>;
// -- ST3, multiple structures
def : InstRW<[Ampere1BWrite_6cyc_3XY_3S_3Z],
             (instregex "^ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
// -- ST4 from a single lane of 4 registers
def : InstRW<[Ampere1BWrite_7cyc_4XY_4S_4Z],
             (instregex "^ST4i(8|16|32|64)")>;
// -- ST4, multiple structures, 16B/8H/4S
def : InstRW<[Ampere1BWrite_7cyc_4XY_4S_4Z],
             (instregex "^ST4Fourv(16b|8h|4s)")>;
// -- ST4, multiple structures, 2D
def : InstRW<[Ampere1BWrite_7cyc_4XY_4S_4Z],
             (instregex "^ST4Fourv2d")>;
// -- ST4, multiple structures, 8B/4H/2S
def : InstRW<[Ampere1BWrite_9cyc_6XY_4S_4Z],
             (instregex "^ST4Fourv(8b|4h|2s)")>;
// -- Store pair, Q-form
def : InstRW<[Ampere1BWrite_3cyc_2S_2Z],
             (instregex "^STN?PQ")>;
// -- Store pair, S/D-form
def : InstRW<[Ampere1BWrite_3cyc_2S_2Z],
             (instregex "^STN?P[SD]")>;
// -- Store register
def : InstRW<[Ampere1BWrite_2cyc_1S_1Z],
             (instregex "^STU?R[BHSDQ](ui|i)")>;
// -- Store register, sign-extended register offset
def : InstRW<[Ampere1BWrite_2cyc_1S_1Z],
             (instregex "^STR[BHSDQ]ro[XW]")>;
820
// FP data processing, bfloat16 format
// -- convert to bfloat16, scalar
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instrs BFCVT)>;
// -- convert to bfloat16 and narrow, vector (lower/upper half)
def : InstRW<[Ampere1BWrite_8cyc_2XY], (instrs BFCVTN, BFCVTN2)>;
// -- dot product
def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^BFDOTv", "^BF16DOT")>;
// -- matrix multiply-accumulate
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instrs BFMMLA)>;
// -- multiply-add long
def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^BFMLAL")>;
827
// FP data processing, scalar/vector, half precision
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(ABD|ABS)v.[fi]16")>;
def : InstRW<[Ampere1BWrite_3cyc_1XY],
        (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi]16")>;
// FP compare to mask, vector and scalar H-forms
def : InstRW<[Ampere1BWrite_3cyc_1XY],
        (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi]16")>;
def : InstRW<[Ampere1BWrite_3cyc_1XY],
        (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)16")>;
// FP compare / conditional compare / conditional select, H-form
def : InstRW<[Ampere1BWrite_3cyc_1X],
        (instregex "^FCMPE?H")>;
def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1X],
        (instregex "^FCCMPE?H")>;
def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1XY],
        (instregex "^FCSELH")>;
// Convert FP to integer, vector, H-form
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if]16")>;
// Convert integer to FP, vector, H-form
// NOTE(review): the lowercase 'd' in [SUd] does not look like it can start an
// instruction name here -- confirm whether it is intentional.
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^[SUd]CVTFv.[fi]16")>;
// Convert to FP from GPR, H-form
// NOTE(review): the "_ZPmZ_" suffix resembles SVE predicated instruction
// names, while this model lists SVEUnsupported -- confirm this pattern selects
// the intended GPR-source converts.
def : InstRW<[Ampere1BWrite_8cyc_1BS_1XY], (instregex "^[SU]CVTF_ZPmZ_[DSH]toH$")>;
// Convert to FP from GPR, fixed-point, H-form
def : InstRW<[Ampere1BWrite_11cyc_1BS_2XY], (instregex "^[SU]CVTF[SU][WX]Hri$")>;
// FP divide, scalar H-form and vector F16
def : InstRW<[Ampere1BWrite_9cyc_1X], (instrs FDIVHrr)>;
def : InstRW<[Ampere1BWrite_17cyc_1X], (instregex "^FDIVv.[if]16")>;
// FP max/min: element-wise and pairwise, then across-vector (4H and 8H)
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if]16")>;
def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv4[if]16")>;
def : InstRW<[Ampere1BWrite_9cyc_3XY], (instregex "^F(MAX|MIN)(NM)?Vv8[if]16")>;
// FP multiply and multiply-accumulate
def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FMULX?v.[if]16")>;
def : InstRW<[Ampere1BWrite_4cyc_1XY], (instrs FMULX16)>;
def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FN?M(ADD|SUB)[H]rrr")>;
def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FML[AS]v.[if]16")>;
// FP reciprocal exponent / reciprocal step
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRECPXv.[if]16")>;
def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^F(RECP|RSQRT)S16")>;
// FP round to integral
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if]16")>;
// FP square root, H-form
def : InstRW<[Ampere1BWrite_21cyc_1X], (instrs FSQRTHr)>;
// FP square root, vector-form, F16
def : InstRW<[Ampere1BWrite_39cyc_1X], (instregex "^FSQRTv.f16")>;
865
// FP data processing, scalar/vector, single/double precision
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(ABD|ABS)v.[fi](32|64)")>;
def : InstRW<[Ampere1BWrite_3cyc_1XY],
        (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi](32|64)")>;
// FP compare to mask, vector and scalar S/D-forms
def : InstRW<[Ampere1BWrite_3cyc_1XY],
        (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi](32|64)")>;
def : InstRW<[Ampere1BWrite_3cyc_1XY],
        (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)(32|64)")>;
// FP compare / conditional compare / conditional select, S/D-form
def : InstRW<[Ampere1BWrite_3cyc_1X],
        (instregex "^FCMPE?(S|D)")>;
def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1X],
        (instregex "^FCCMPE?(S|D)")>;
def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1XY],
        (instregex "^FCSEL(S|D)")>;
// Convert FP to integer, vector, S/D-form
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if](32|64)")>;
// Convert integer to FP, vector, S/D-form
// NOTE(review): the lowercase 'd' in [SUd] does not look like it can start an
// instruction name here -- confirm whether it is intentional.
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^[SUd]CVTFv.[fi](32|64)")>;
// Convert to FP from GPR, S/D-form
// NOTE(review): the "_ZPmZ_" suffix resembles SVE predicated instruction
// names, while this model lists SVEUnsupported -- confirm this pattern selects
// the intended GPR-source converts.
def : InstRW<[Ampere1BWrite_8cyc_1BS_1XY], (instregex "^[SU]CVTF_ZPmZ_[DSH]to[DS]$")>;
// Convert to FP from GPR, fixed-point, S/D-form
def : InstRW<[Ampere1BWrite_11cyc_1BS_2XY], (instregex "^[SU]CVTF[SU][WX][SD]ri$")>;
// FP divide: double precision, then single precision (vector and scalar)
def : InstRW<[Ampere1BWrite_19cyc_1X], (instregex "^FDIVv.[if](64)", "FDIVD")>;
def : InstRW<[Ampere1BWrite_12cyc_1X], (instregex "^FDIVv.[if](32)", "FDIVS")>;
// FP max/min: element-wise and pairwise, then across-vector
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if](32|64)")>;
def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv.[if](32|64)")>;
// FP multiply and multiply-accumulate
def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FMULX?v.[if](32|64)")>;
def : InstRW<[Ampere1BWrite_4cyc_1XY], (instrs FMULX32, FMULX64)>;
def : InstRW<[Ampere1BWrite_4cyc_1XY], (instrs FMULSrr, FNMULSrr)>;
def : InstRW<[Ampere1BWrite_4cyc_1XY], (instrs FMULDrr, FNMULDrr)>;
def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FN?M(ADD|SUB)[SD]rrr")>;
def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FML[AS]v.[if](32|64)")>;
// FP reciprocal exponent / reciprocal step
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRECPXv.[if](32|64)")>;
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(RECP|RSQRT)S(32|64)")>;
// FP round to integral
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if](32|64)")>;
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRINT(32|64)")>;
// FP square root: double precision, then single precision (vector and scalar)
def : InstRW<[Ampere1BWrite_63cyc_1X], (instregex "^FSQRTv.f64", "^FSQRTDr")>;
def : InstRW<[Ampere1BWrite_33cyc_1X], (instregex "^FSQRTv.f32", "^FSQRTSr")>;
903
// FP miscellaneous instructions
// -- convert FP to integer, result in GPR
def : InstRW<[Ampere1BWrite_7cyc_1XY_1Z], (instregex "^FCVT[AMNPZ][SU][SU][XW][HSD]r")>;
// -- convert between FP precisions, scalar
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[HSD]Hr")>;
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[HSD][SD]r")>;
// -- convert between FP precisions, vector (lengthen / narrow)
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVTLv")>;
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT(N|XN)v")>;
// -- JavaScript convert to signed fixed-point
def : InstRW<[Ampere1BWrite_7cyc_1X_1Z], (instrs FJCVTZS)>;
// -- FP moves between GPR and FP registers, and FP register/immediate moves
def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^FMOV[HSD][WX]r")>;
def : InstRW<[Ampere1BWrite_7cyc_1BS_1XY], (instregex "^FMOVDXHighr")>;
def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^FMOV[HSD][ri]")>;
def : InstRW<[Ampere1BWrite_5cyc_1X], (instregex "^FMOVXDHighr")>;
def : InstRW<[Ampere1BWrite_3cyc_1Z], (instregex "^FMOV[WX][HSD]r")>;
916
// Integer arithmetic and logical instructions
// -- add/subtract with carry
def : InstRW<[Ampere1BWrite_1cyc_1A],
        (instregex "ADC(W|X)r", "SBC(W|X)r")>;
// -- arithmetic/logical, "rs"/"rx" operand forms
def : InstRW<[Ampere1BWrite_Arith],
        (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)[WX]r[sx]")>;
// -- arithmetic/logical, "rr"/"ri" operand forms
def : InstRW<[Ampere1BWrite_1cyc_1AB],
        (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)[WX]r[ri]")>;
// -- flag-setting arithmetic/logical, "rs"/"rx" operand forms
def : InstRW<[Ampere1BWrite_ArithFlagsetting],
        (instregex "(ADD|AND|BIC|SUB)S[WX]r[sx]")>;
// -- flag-setting arithmetic/logical, "rr"/"ri" operand forms
def : InstRW<[Ampere1BWrite_1cyc_1A],
        (instregex "(ADD|AND|BIC|SUB)S[WX]r[ri]")>;
// -- flag-setting add/subtract with carry
def : InstRW<[Ampere1BWrite_1cyc_1A],
        (instregex "(ADC|SBC)S[WX]r")>;
// -- rotate, mask and insert flags
def : InstRW<[Ampere1BWrite_1cyc_1A], (instrs RMIF)>;
// -- conditional compare
def : InstRW<[Ampere1BWrite_1cyc_1A],
        (instregex "(CCMN|CCMP)(X|W)")>;
// -- conditional select
def : InstRW<[Ampere1BWrite_1cyc_1A],
        (instregex "(CSEL|CSINC|CSINV|CSNEG)(X|W)")>;
// -- divide, W-form then X-form
def : InstRW<[Ampere1BWrite_13cyc_1BS_1X], (instrs SDIVWr, UDIVWr)>;
def : InstRW<[Ampere1BWrite_19cyc_2BS_1X], (instrs SDIVXr, UDIVXr)>;
// -- multiply high
def : InstRW<[Ampere1BWrite_3cyc_1BS],
        (instregex "(S|U)MULHr")>;
// -- multiply-add/subtract, including the long forms
def : InstRW<[Ampere1BWrite_4cyc_1BS_1AB],
        (instregex "(S|U)?M(ADD|SUB)L?r")>;
941
// Integer load instructions
// -- load pair
def : InstRW<[Ampere1BWrite_3cyc_1L],
        (instregex "(LDNP|LDP|LDPSW)(X|W)")>;
// -- load register, "ui" immediate form
def : InstRW<[Ampere1BWrite_3cyc_1L],
        (instregex "LDR(B|D|H|Q|S)ui")>;
// -- load register, literal
def : InstRW<[Ampere1BWrite_3cyc_1L],
        (instregex "LDR(D|Q|W|X)l")>;
// -- load register, LDTR* names
def : InstRW<[Ampere1BWrite_3cyc_1L],
        (instregex "LDTR(B|H|W|X)i")>;
def : InstRW<[Ampere1BWrite_3cyc_1L],
        (instregex "LDTRS(BW|BX|HW|HX|W)i")>;
// -- load register, LDUR* names
def : InstRW<[Ampere1BWrite_3cyc_1L],
        (instregex "LDUR(BB|HH|X|W)i")>;
def : InstRW<[Ampere1BWrite_3cyc_1L],
        (instregex "LDURS(BW|BX|HW|HX|W)i")>;
// -- load register, register offset
def : InstRW<[Ampere1BWrite_3cyc_1L],
        (instregex "LDR(HH|SHW|SHX|W|X)ro(W|X)")>;
// -- prefetch
// NOTE(review): this list previously named PRFUMi twice; the duplicate has
// been dropped (no change to the matched set).  The second entry may have
// been meant to be PRFMui -- confirm before adding it, since it must not
// overlap another InstRW in this model.
def : InstRW<[Ampere1BWrite_1cyc_1L],
        (instrs PRFMl, PRFUMi)>;
def : InstRW<[Ampere1BWrite_1cyc_1L],
        (instrs PRFMroW, PRFMroX)>;
963
// Integer miscellaneous instructions
// -- PC-relative address generation
def : InstRW<[Ampere1BWrite_1cyc_1A],  (instrs ADR, ADRP)>;
// -- extract register / bitfield move
def : InstRW<[Ampere1BWrite_1cyc_1B],  (instregex "EXTR(W|X)")>;
def : InstRW<[Ampere1BWrite_1cyc_1B],  (instregex "(S|U)?BFM(W|X)")>;
// -- CRC32 checksum
def : InstRW<[Ampere1BWrite_3cyc_1BS], (instregex "^CRC32C?[BHWX]")>;
// -- count leading sign bits
def : InstRW<[Ampere1BWrite_1cyc_1B],  (instregex "CLS(W|X)")>;
// -- set flags from 8/16-bit value
def : InstRW<[Ampere1BWrite_1cyc_1A],  (instrs SETF8, SETF16)>;
// -- move wide immediate
def : InstRW<[Ampere1BWrite_1cyc_1AB],
        (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>;
// -- reverse bits/bytes
def : InstRW<[Ampere1BWrite_1cyc_1B],
        (instregex "(RBIT|REV|REV16)(W|X)r", "REV32Xr")>;
// -- variable shift/rotate
def : InstRW<[Ampere1BWrite_1cyc_1B],
        (instregex "(ASR|LSL|LSR|ROR)V(W|X)r")>;
977
// Integer store instructions
// -- store pair (non-temporal, X-form, W-form, and pre/post-indexed)
def : InstRW<[Ampere1BWrite_1cyc_2S],        (instregex "STNP(X|W)i")>;
def : InstRW<[Ampere1BWrite_1cyc_2S],        (instrs STPXi)>;
def : InstRW<[Ampere1BWrite_2cyc_1B_1S],     (instrs STPWi)>;
def : InstRW<[Ampere1BWrite_2cyc_1B_1S_1AB], (instregex "STP(W|X)(pre|post)")>;
// -- store register, STTR* names
def : InstRW<[Ampere1BWrite_1cyc_1S],        (instrs STTRBi, STTRHi, STTRWi, STTRXi)>;
// -- store register, STUR*i and STR(X|W)ui names
// NOTE(review): "STUR(BB|HH|X|W)i" was listed twice here; the duplicate has
// been dropped (no change to the matched set).  One copy may have been meant
// to cover the byte/halfword "ui" stores -- confirm against the intended
// instruction list.
def : InstRW<[Ampere1BWrite_1cyc_1S],        (instregex "STUR(BB|HH|X|W)i",
                                                        "STR(X|W)ui")>;
// -- store register, register offset
def : InstRW<[Ampere1BWrite_1cyc_2S],        (instrs STRWroX, STRXroX)>;
def : InstRW<[Ampere1BWrite_1cyc_2S],        (instrs STRWroW, STRXroW)>;
989
// Memory tagging

// Insert Random Tags
def : InstRW<[Ampere1BWrite_1cyc_1BS_1B], (instrs IRG, IRGstack)>;
// Load allocation tag
def : InstRW<[Ampere1BWrite_4cyc_1L_1B], (instrs LDG, LDGM)>;
// Store allocation tags
def : InstRW<[Ampere1BWrite_1cyc_1S],
    (instrs STGi, STGM, STGPreIndex, STGPostIndex)>;
// Store allocation tags and pair of registers
def : InstRW<[Ampere1BWrite_1cyc_2S],
    (instrs STGPi, STGPpre, STGPpost)>;
// Store allocation tags and zero data
def : InstRW<[Ampere1BWrite_1cyc_1S],
    (instrs STZGi, STZGM, STZGPreIndex, STZGPostIndex)>;
// Store two tags
def : InstRW<[Ampere1BWrite_1cyc_2S],
    (instrs ST2Gi, ST2GPreIndex, ST2GPostIndex)>;
// Store two tags and zero data
def : InstRW<[Ampere1BWrite_1cyc_2S],
    (instrs STZ2Gi, STZ2GPreIndex, STZ2GPostIndex)>;
// Subtract Pointer
def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs SUBP)>;
// Subtract Pointer, flagset
def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs SUBPS)>;
// Insert Tag Mask
def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs GMI)>;
// Arithmetic, immediate to logical address tag
def : InstRW<[Ampere1BWrite_1cyc_1B], (instrs ADDG, SUBG)>;
1019
// Pointer authentication
// -- authenticate
def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^AUT")>;
// -- authenticated branch / return / exception return
def : InstRW<[Ampere1BWrite_6cyc_1BS_1A],
        (instregex "BRA(A|AZ|B|BZ)", "RETA(A|B)", "ERETA(A|B)")>;
// -- authenticated branch with link
def : InstRW<[Ampere1BWrite_6cyc_1BS_2A],
        (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ)>;
// -- sign (compute pointer authentication code)
def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^PAC")>;
// -- authenticated load
def : InstRW<[Ampere1BWrite_8cyc_1BS_1L], (instregex "^LDRA(A|B)")>;
// -- strip pointer authentication code
def : InstRW<[Ampere1BWrite_1cyc_1B], (instrs XPACD, XPACI)>;
1029
// Vector integer instructions
// -- absolute difference
def : InstRW<[Ampere1BWrite_2cyc_1XY],
             (instregex "^SABAv", "^SABALv", "^SABDv", "^SABDLv",
                        "^UABAv", "^UABALv", "^UABDv", "^UABDLv")>;
// -- arithmetic
def : InstRW<[Ampere1BWrite_2cyc_1XY],
        (instregex "^ABSv", "^(ADD|SUB)v", "^SADDLv", "^SADDW", "SHADD",
                   "SHSUB", "^SRHADD", "^URHADD", "SSUBL", "SSUBW",
                   "^UADDLv", "^UADDW", "UHADD", "UHSUB", "USUBL", "USUBW")>;
// -- arithmetic, horizontal, 16B
def : InstRW<[Ampere1BWrite_8cyc_4XY],
            (instregex "^ADDVv16i8v", "^SADDLVv16i8v", "^UADDLVv16i8v")>;
def : InstRW<[Ampere1BWrite_8cyc_4XY],
            (instregex "^[SU](MIN|MAX)Vv16i8v")>;
// -- arithmetic, horizontal: add-across on v8i8/v4i16/v2i32,
//    min/max-across on v4i16/v4i32
// NOTE(review): this group was previously labelled "4H/4S", which matches the
// min/max list but not the add-across list -- confirm the add-across shapes
// are split between this group and the next one as intended.
def : InstRW<[Ampere1BWrite_4cyc_2XY],
            (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v")>;
def : InstRW<[Ampere1BWrite_4cyc_2XY],
            (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v")>;
// -- arithmetic, horizontal: add-across on v8i16/v4i32,
//    min/max-across on v8i8/v8i16
// NOTE(review): previously labelled "8B/8H", which matches the min/max list
// but not the add-across list -- confirm as above.
def : InstRW<[Ampere1BWrite_6cyc_3XY],
            (instregex "^[SU]?ADDL?V(v8i16|v4i32)v")>;
def : InstRW<[Ampere1BWrite_6cyc_3XY],
            (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v")>;
// -- arithmetic, narrowing
def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "(ADD|SUB)HNv.*")>;
def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "(RADD|RSUB)HNv.*")>;
// -- arithmetic, pairwise
def : InstRW<[Ampere1BWrite_2cyc_1XY],
        (instregex "^ADDPv", "^SADALP", "^UADALP", "^SADDLPv", "^UADDLPv")>;
// -- arithmetic, saturating
def : InstRW<[Ampere1BWrite_2cyc_1XY],
        (instregex "^SQADD", "^SQSUB", "^SUQADD", "^UQADD", "^UQSUB", "^USQADD")>;
// -- bit count
def : InstRW<[Ampere1BWrite_2cyc_1XY],
        (instregex "^(CLS|CLZ|CNT)v")>;
// -- compare
def : InstRW<[Ampere1BWrite_2cyc_1XY],
        (instregex "^CMEQv", "^CMGEv", "^CMGTv", "^CMLEv", "^CMLTv",
                   "^CMHIv", "^CMHSv")>;
// -- compare non-zero
def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^CMTSTv")>;
// -- dot product
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^(S|SU|U|US)DOTv")>;
// -- fp reciprocal estimate
def : InstRW<[Ampere1BWrite_6cyc_1X], (instregex "^FRECPEv", "^FRSQRTEv")>;
// -- integer reciprocal estimate
def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^URECPEv", "^URSQRTEv")>;
// -- logical
def : InstRW<[Ampere1BWrite_2cyc_1XY],
        (instregex "^ANDv", "^BICv", "^EORv", "^ORRv", "^ORNv", "^NOTv")>;
// -- shift right / extract, narrowing (previously labelled "logical,
//    narrowing")
def : InstRW<[Ampere1BWrite_6cyc_2XY],
        (instregex "RSHRNv",
                   "SHRNv", "SQSHRNv", "SQSHRUNv",
                   "UQXTNv")>;
// -- matrix multiply
def : InstRW<[Ampere1BWrite_3cyc_1XY],
        (instrs SMMLA, UMMLA, USMMLA)>;
// -- max/min, element-wise then pairwise
def : InstRW<[Ampere1BWrite_2cyc_1XY],
        (instregex "^SMAXv", "^SMINv", "^UMAXv", "^UMINv")>;
def : InstRW<[Ampere1BWrite_2cyc_1XY],
        (instregex "^SMAXPv", "^SMINPv", "^UMAXPv", "^UMINPv")>;
// -- move immediate
def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^MOVIv", "^MVNIv")>;
// -- multiply
def : InstRW<[Ampere1BWrite_3cyc_1XY],
        (instregex "MULv", "SMULLv", "UMULLv", "SQDMUL(H|L)v", "SQRDMULHv")>;
// -- multiply accumulate
def : InstRW<[Ampere1BWrite_3cyc_1XY],
        (instregex "MLAv", "MLSv", "(S|U|SQD)(MLAL|MLSL)v", "SQRDML(A|S)Hv")>;
// -- negation, saturating
def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^SQABS", "^SQNEG")>;
// -- reverse bits/bytes
def : InstRW<[Ampere1BWrite_2cyc_1XY],
        (instregex "^RBITv", "^REV16v", "^REV32v", "^REV64v")>;
// -- shift (only the v16i8/v8i16/v4i32/v2i64 arrangements are listed here)
def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>;
// -- shift and accumulate
def : InstRW<[Ampere1BWrite_2cyc_1XY],
        (instregex "SRSRAv", "SSRAv", "URSRAv", "USRAv")>;
// -- shift, saturating
def : InstRW<[Ampere1BWrite_2cyc_1XY],
        (instregex "^SQRSHLv", "^SQRSHRNv", "^SQRSHRUNv", "^SQSHL", "^SQSHLU",
                   "^SQXTNv", "^SQXTUNv", "^UQSHRNv", "UQRSHRNv", "^UQRSHL",
                   "^UQSHL")>;
1118
// Vector miscellaneous instructions
// -- duplicate element
def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^DUPv.+lane")>;
// -- duplicate from GPR
def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^DUPv.+gpr")>;
// -- extract narrow
def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^XTNv")>;
// -- insert/extract element
def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^EXTv", "^INSv.+lane")>;
// -- move FP immediate
def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^FMOVv")>;
// -- move element to GPR
def : InstRW<[Ampere1BWrite_5cyc_1X], (instregex "(S|U)MOVv")>;
// -- move from GPR to any element
def : InstRW<[Ampere1BWrite_7cyc_1BS_1XY], (instregex "^INSv.+gpr")>;
// -- table lookup, by number of table registers (One..Four)
def : InstRW<[Ampere1BWrite_2cyc_1XY],
            (instrs TBLv8i8One, TBLv16i8One, TBXv8i8One, TBXv16i8One)>;
def : InstRW<[Ampere1BWrite_4cyc_2XY],
            (instrs TBLv8i8Two, TBLv16i8Two, TBXv8i8Two, TBXv16i8Two)>;
def : InstRW<[Ampere1BWrite_6cyc_3XY],
            (instrs TBLv8i8Three, TBLv16i8Three, TBXv8i8Three, TBXv16i8Three)>;
def : InstRW<[Ampere1BWrite_8cyc_4XY],
            (instrs TBLv8i8Four, TBLv16i8Four, TBXv8i8Four, TBXv16i8Four)>;
// -- transpose and unzip
def : InstRW<[Ampere1BWrite_2cyc_1XY],
              (instregex "^TRN1v", "^TRN2v", "^UZP1v", "^UZP2v")>;
// -- zip
def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^ZIP1v", "^ZIP2v")>;
1148
1149} // SchedModel = Ampere1BModel
1150