1;; Scheduling description for the SPARC M8.
2;;   Copyright (C) 2017-2020 Free Software Foundation, Inc.
3;;
4;; This file is part of GCC.
5;;
6;; GCC is free software; you can redistribute it and/or modify
7;; it under the terms of the GNU General Public License as published by
8;; the Free Software Foundation; either version 3, or (at your option)
9;; any later version.
10;;
11;; GCC is distributed in the hope that it will be useful,
12;; but WITHOUT ANY WARRANTY; without even the implied warranty of
13;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14;; GNU General Public License for more details.
15;;
16;; You should have received a copy of the GNU General Public License
17;; along with GCC; see the file COPYING3.  If not see
18;; <http://www.gnu.org/licenses/>.
19
;; Things to improve:
21;;
22;; - Store instructions are implemented by micro-ops, one of which
23;;   generates the store address and is executed in the store address
24;;   generation unit in the slot0.  We need to model that.
25;;
26;; - There are two V3 pipes connected to different slots.  The current
27;;   implementation assumes that all the instructions executing in a
28;;   V3 pipe are issued to the unit in slot3.
29;;
30;; - Single-issue ALU operations incur an additional cycle of latency to
31;;   slot 0 and slot 1 instructions.  This is not currently reflected
32;;   in the DFA.
33
;; Automaton that hosts the M8 issue-slot units declared in this file.
(define_automaton "m8_0")
35
36;; The S5 core has two dual-issue queues, PQLS and PQEX.  Each queue
37;; is divided into two slots: PQLS corresponds to slots 0 and 1, and
38;; PQEX corresponds to slots 2 and 3.  The core can issue 4
39;; instructions per-cycle, and up to 4 instructions are committed each
40;; cycle.
41;;
42;;
43;;                   m8_slot0  - Load Unit.
44;;                             - Store address gen. Unit.
45;;
46;;
47;;   === PQLS ==>    m8_slot1  - Store data unit.
48;;                             - Branch unit.
49;;
50;;
51;;   === PQEX ==>    m8_slot2  - Integer Unit (EXU2).
52;;                             - 3-cycles Crypto Unit (SPU2).
53;;
54;;                   m8_slot3  - Integer Unit (EXU3).
55;;                             - 3-cycles Crypto Unit (SPU3).
56;;                             - Floating-point and graphics unit (FPG).
57;;                             - Long-latency Crypto Unit.
58;;                             - Oracle Numbers Unit (ONU).
59
;; One CPU unit per issue slot, all of them part of the "m8_0" automaton.
(define_cpu_unit "m8_slot0" "m8_0")
(define_cpu_unit "m8_slot1" "m8_0")
(define_cpu_unit "m8_slot2" "m8_0")
(define_cpu_unit "m8_slot3" "m8_0")
61
;; Some instructions stall the pipeline and prevent any other
;; instruction from being issued in the same cycle.  The same is
;; assumed for multi-instruction insns.  This reservation models that
;; by occupying all four slots simultaneously.

(define_reservation "m8_single_issue"
  "m8_slot0 + m8_slot1 + m8_slot2 + m8_slot3")
67
;; Instructions that are issued alone: they take every slot for one
;; cycle and have a latency of 1.
(define_insn_reservation "m8_single" 1
  (and (eq_attr "cpu" "m8")
       (eq_attr "type"
                "multi,savew,flushw,trap,bmask"))
  "m8_single_issue")
72
;; Most of the instructions executing in the integer units (EXU2 in
;; slot 2 and EXU3 in slot 3) have a latency of 1.
;;
;; Note that bmask is not listed here: the earlier m8_single
;; reservation already claims it, so repeating it in this reservation
;; would only make the two conditions overlap.

(define_insn_reservation "m8_integer" 1
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "ialu,ialuX,shift,cmove,compare"))
  "(m8_slot2 | m8_slot3)")
80
;; Flushing the instruction memory takes 27 cycles.  The instruction
;; is issued to slot 2 or slot 3; the remaining 26 cycles reserve no
;; unit.

(define_insn_reservation "m8_iflush" 27
  (and (eq_attr "cpu" "m8") (eq_attr "type" "iflush"))
  "(m8_slot2 | m8_slot3), nothing*26")
88
;; The integer multiplication instructions have a latency of 10 cycles
;; and execute in integer units.
;;
;; Likewise for array*, edge* and pdistn instructions.
;;
;; The instruction occupies slot 2 or slot 3 on the issue cycle only;
;; the remaining 9 cycles of the latency reserve no unit.
;;
;; However, the latency is only 9 cycles if the consumer of the
;; operation is also capable of 9 cycles latency.  We model this with
;; a bypass.

(define_insn_reservation "m8_imul" 10
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "imul,array,edge,edgen,pdistn"))
  "(m8_slot2 | m8_slot3), nothing*9")

(define_bypass 9 "m8_imul" "m8_imul")
104
;; Integer division (`sdiv' and `udivx') executes in the integer units
;; and has a latency of 30 cycles: one issue cycle in slot 2 or slot 3
;; followed by 29 cycles that reserve no unit.

(define_insn_reservation "m8_idiv" 30
  (and (eq_attr "cpu" "m8") (eq_attr "type" "idiv"))
  "(m8_slot2 | m8_slot3), nothing*29")
112
;; Both integer and floating-point load instructions have a latency of
;; only 3 cycles, and execute in slot0.
;;
;; Misaligned load instructions feature a latency of 11 cycles.
;;
;; The prefetch instruction also executes in the load unit, but its
;; latency is only 1 cycle.
120
;; Regular loads, signed loads and floating-point loads: issued to
;; slot0, 3 cycles of latency.
(define_insn_reservation "m8_load" 3
  (and (eq_attr "cpu" "m8")
       (ior (eq_attr "type" "fpload,sload")
            (and (eq_attr "type" "load") (eq_attr "subtype" "regular"))))
  "m8_slot0, nothing*2")
127
128;; (define_insn_reservation "m8_load_misalign" 11
129;;  (and (eq_attr "cpu" "m8")
130;;       (eq_attr "type" "load_mis,fpload_mis"))
131;;  "m8_slot0, nothing*10")
132
;; Prefetches are insns of type "load" with the "prefetch" subtype;
;; they take slot0 for a single cycle.
(define_insn_reservation "m8_prefetch" 1
  (and (eq_attr "cpu" "m8")
       (and (eq_attr "type" "load")
            (eq_attr "subtype" "prefetch")))
  "m8_slot0")
138
;; Integer and floating-point stores alike execute in the store data
;; unit in slot1 and have a latency of 1 cycle.
;;
;; Misaligned stores, however, feature a latency of 3 cycles.

(define_insn_reservation "m8_store" 1
  (and (eq_attr "cpu" "m8") (eq_attr "type" "store,fpstore"))
  "m8_slot1")
149
150;; (define_insn_reservation "m8_store_misalign" 3
151;;   (and (eq_attr "cpu" "m8")
152;;        (eq_attr "type" "store_mis,fpstore_mis"))
153;;   "m8_slot1, nothing*2")
154
;; Control-transfer instructions (branches, calls and returns) are
;; executed by the Branch Unit in slot1, with a latency of 1 cycle.

(define_insn_reservation "m8_cti" 1
  (and (eq_attr "cpu" "m8")
       (eq_attr "type"
                "cbcond,uncond_cbcond,branch,call,sibcall,call_no_delay_slot,uncond_branch,return"))
  "m8_slot1")
162
;; Many instructions executing in the Floating-point and Graphics Unit
;; (FGU) serving slot3 feature a default latency of 9 cycles: one
;; issue cycle in slot3 followed by 8 cycles that reserve no unit.

(define_insn_reservation "m8_fp" 9
  (and (eq_attr "cpu" "m8")
       (ior (eq_attr "type"
                     "fpmove,fpcmove,fpcrmove,fp,fpcmp,fpmul,fgm_pack,fgm_mul,pdist")
            (and (eq_attr "type" "fga") (eq_attr "subtype" "fpu"))))
  "m8_slot3, nothing*8")
172
;; Floating-point division and floating-point square-root instructions
;; have high latencies.  They execute in the FGU.
;;
;; Single-precision division: 26 cycles.

(define_insn_reservation "m8_fpdivs" 26
  (and (eq_attr "cpu" "m8") (eq_attr "type" "fpdivs"))
  "m8_slot3, nothing*25")
180
;; Insns of type fpsqrts: issued to slot3, 33 cycles of latency.
(define_insn_reservation "m8_fpsqrts" 33
  (and (eq_attr "cpu" "m8") (eq_attr "type" "fpsqrts"))
  "m8_slot3, nothing*32")
185
;; Insns of type fpdivd: issued to slot3, 30 cycles of latency.
(define_insn_reservation "m8_fpdivd" 30
  (and (eq_attr "cpu" "m8") (eq_attr "type" "fpdivd"))
  "m8_slot3, nothing*29")
190
;; Insns of type fpsqrtd: issued to slot3, 41 cycles of latency.
(define_insn_reservation "m8_fpsqrtd" 41
  (and (eq_attr "cpu" "m8") (eq_attr "type" "fpsqrtd"))
  "m8_slot3, nothing*40")
195
;; SIMD VIS instructions executing in the Floating-point and graphics
;; unit (FPG) in slot3 usually have a latency of 5 cycles.
;;
;; For many of them, however, the latency drops to 3 cycles when the
;; consumer can also be executed in 3 cycles; this is modelled with a
;; bypass.  In those cases the instructions are executed in one of the
;; two 3-cycle crypto units (SPU, also known as "v3-pipes") in slots 2
;; and 3.

(define_insn_reservation "m8_vis" 5
  (and (eq_attr "cpu" "m8")
       (ior (eq_attr "type" "viscmp,lzd")
            (ior (and (eq_attr "type" "fga")
                      (eq_attr "subtype" "maxmin,cmask,other"))
                 (ior (and (eq_attr "type" "vismv")
                           (eq_attr "subtype" "single,movstouw"))
                      (and (eq_attr "type" "visl")
                           (eq_attr "subtype" "single"))))))
  "m8_slot3, nothing*4")

;; Producer and consumer both in m8_vis: 3 cycles instead of 5.
(define_bypass 3 "m8_vis" "m8_vis")
217
;; Insns of type "gsr" with the "alignaddr" subtype: issued to slot3,
;; 5 cycles of latency.
(define_insn_reservation "m8_gsr" 5
  (and (eq_attr "cpu" "m8")
       (and (eq_attr "type" "gsr")
            (eq_attr "subtype" "alignaddr")))
  "m8_slot3, nothing*4")
223
;; A few VIS instructions have a latency of 1: they only take slot3
;; for the issue cycle.

(define_insn_reservation "m8_vis_1cycle" 1
  (and (eq_attr "cpu" "m8")
       (ior (and (eq_attr "type" "vismv")
                 (eq_attr "subtype" "double,movxtod,movdtox"))
            (ior (and (eq_attr "type" "visl")
                      (eq_attr "subtype" "double"))
                 (and (eq_attr "type" "fga")
                      (eq_attr "subtype" "addsub64")))))
  "m8_slot3")
235
;; Reading and writing to the gsr register takes more than 70 cycles.
;; It is modelled with a latency of 70: one issue cycle in slot3
;; followed by 69 cycles that reserve no unit.

(define_insn_reservation "m8_gsr_reg" 70
  (and (eq_attr "cpu" "m8")
       (and (eq_attr "type" "gsr")
            (eq_attr "subtype" "reg")))
  "m8_slot3, nothing*69")
243